Libraries used in the analysis

library(DESeq2)
library(RColorBrewer)
library(Biobase)
library(pheatmap)
library(annotables)
library(openxlsx)
library(rio)
library(tidyverse)



Importing feature counts table

counts_data<- read.table ("featurecounts.txt", header=T, row.names = 1)
head(counts_GSEA)
ABCDEFGHIJ0123456789
 
 
Geneid
<chr>
Chr
<chr>
1ENSG00000223972.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
2ENSG00000227232.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
3ENSG00000278267.1chr1
4ENSG00000243485.5chr1;chr1;chr1;chr1;chr1
5ENSG00000284332.1chr1
6ENSG00000237613.2chr1;chr1;chr1;chr1;chr1



Data Manipulation

head(counts_data) #view the counts_data table 
ABCDEFGHIJ0123456789
 
 
Chr
<chr>
ENSG00000223972.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
ENSG00000227232.5chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1
ENSG00000278267.1chr1
ENSG00000243485.5chr1;chr1;chr1;chr1;chr1
ENSG00000284332.1chr1
ENSG00000237613.2chr1;chr1;chr1;chr1;chr1



removing the first 5 columns

counts_data <- counts_data [ ,6:ncol(counts_data)] 
#head(counts_data)



Renamaming Columns

colnames(counts_data) <- c("NHEM76","NHEM77","Sbcl2_64","Sbcl2_70","WM1158_68","WM1158_74","WM1366_66","WM1366_72","WM3211_65","WM3211_71","WM793_67","WM793_73","WM9_69","WM9_75")
head(counts_data)
ABCDEFGHIJ0123456789
 
 
NHEM76
<int>
NHEM77
<int>
Sbcl2_64
<int>
Sbcl2_70
<int>
WM1158_68
<int>
WM1158_74
<int>
WM1366_66
<int>
WM1366_72
<int>
ENSG00000223972.520212235
ENSG00000227232.579109135510
ENSG00000278267.110000102
ENSG00000243485.500111200
ENSG00000284332.100000000
ENSG00000237613.200000000



Reordering coloumns

# Getting columns names and positions
colnames(counts_data)
 [1] "NHEM76"    "NHEM77"    "Sbcl2_64"  "Sbcl2_70"  "WM1158_68"
 [6] "WM1158_74" "WM1366_66" "WM1366_72" "WM3211_65" "WM3211_71"
[11] "WM793_67"  "WM793_73"  "WM9_69"    "WM9_75"   
#Rearranging by columns positions 
counts_data <- counts_data[, c(1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 5, 6)]

This step is important, to be able to group them when creating the metadata needed for the DESeq2 object.

head(counts_data)
ABCDEFGHIJ0123456789
 
 
NHEM76
<int>
NHEM77
<int>
Sbcl2_64
<int>
Sbcl2_70
<int>
WM1366_66
<int>
WM1366_72
<int>
WM3211_65
<int>
WM3211_71
<int>
ENSG00000223972.520213510
ENSG00000227232.57910951074
ENSG00000278267.110000200
ENSG00000243485.500110000
ENSG00000284332.100000000
ENSG00000237613.200000000



Pre-Filtering

Filtering out the lowly expressed genes and checking the dimension of the data set. We will make it more than 10 because lowly expressed genes are kind of hard to measure for differential expression e.g 1 read or 2 reads.

dim(counts_data) #before
[1] 58721    14
counts_data <- counts_data [rowSums(counts_data) > 10, ]
dim(counts_data) #after
[1] 26574    14

rows is reduced from 58721 to 26574


Getting rid off Enseml IDs version

  • For genes (i.e. Ensembl identifiers of the form ENSG*), the version number increments when the set of transcripts linked to a gene changes.

  • The “dot digit” representing distinct version numbers associated with stable Ensembl identifiers must be removed from the Ensembl gene id.Ensembl Documentation stable IDS.

  • This is important to be able to match them with grcm38 dataset in “annotables” package.

tmp=gsub("\\..*","",row.names(counts_data))
rownames(counts_data) <- tmp
head(counts_data)
ABCDEFGHIJ0123456789
 
 
NHEM76
<int>
NHEM77
<int>
Sbcl2_64
<int>
Sbcl2_70
<int>
WM1366_66
<int>
WM1366_72
<int>
WM3211_65
<int>
WM3211_71
<int>
WM793_67
<int>
ENSG00000223972202135101
ENSG0000022723279109510744
ENSG00000241860022225112
ENSG0000027945749191322265
ENSG00000228463112013105726
ENSG0000023709432557256111



DESeq2 analysis Workflow



Creating a metadata frame

colData <- data.frame(row.names=colnames(counts_data), group, con)
colData
ABCDEFGHIJ0123456789
 
 
group
<fctr>
con
<fctr>
NHEM76NHEMnormal
NHEM77NHEMnormal
Sbcl2_64Sbcl2radial
Sbcl2_70Sbcl2radial
WM1366_66WM1366vertical
WM1366_72WM1366vertical
WM3211_65WM3211vertical
WM3211_71WM3211vertical
WM793_67WM793vertical
WM793_73WM793vertical


colData variable will be our metadata

colData <- data.frame(row.names=colnames(counts_data), group, con)
colData
ABCDEFGHIJ0123456789
 
 
group
<fctr>
con
<fctr>
NHEM76NHEMnormal
NHEM77NHEMnormal
Sbcl2_64Sbcl2radial
Sbcl2_70Sbcl2radial
WM1366_66WM1366vertical
WM1366_72WM1366vertical
WM3211_65WM3211vertical
WM3211_71WM3211vertical
WM793_67WM793vertical
WM793_73WM793vertical
#Checking whether the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))
[1] TRUE
# Ensure that the sample names are in the same order in both datasets,
all(colnames(counts_data) == rownames(colData))
[1] TRUE

If not, we can modify this with match() function.



Colors for plots

mycols <- brewer.pal(11, "Set3")[1:length(unique(group))]



Create DESeqDataSet Object

dds<- DESeqDataSetFromMatrix (countData= counts_data, colData=colData, design= ~ con)
design(dds)
~con



Checking the design of your DESEeqDataSets

design(dds)
~con

what is meant by normalized raw counts ?

Library depth, gene length, and RNA composition are three of the most important parameters to consider when normalizing count data.

In terms of gene length, a longer gene results in a longer transcript, which results in more fragments for sequencing. As a result, a longer gene with the same amount of expression will have more counts than a shorter gene. Therefore, when comparing the expression levels of various genes, the lengths of the genes must be taken into account.



Creating the estimateSizeFactors.

  • When adjusting for library size, the library’s composition is also crucial to consider. Many normalization methods that are not resistant to outliers can be skewed by a few highly expressed genes.

  • Normalization for the majority of genes would be skewed by the highly expressed DE gene if we only divided our counts by the total number of reads. As a result, we need to use a technique that is resistant to these outlier genes when performing a DE analysis.

  • DESeq2 normalizes using the “median of ratios” method. which accounts for library size when computing raw counts and is resistant to large numbers of differentially expressed genes.

  • For normalization, the raw counts of each sample will be divided by the sample-specific size factor.




Estimating size factor

  • We will create a data table with read counts normalized by library size

  • Because we have different library sizes for the same number of genes, we need to make a ratio between them, therefore we’ll multiply every position by this size factor, which we can achieve by estimating size factors.

dds <- estimateSizeFactors(dds)
sizeFactors(dds)# to view the size factors used for normalization 
   NHEM76    NHEM77  Sbcl2_64  Sbcl2_70 WM1366_66 WM1366_72 WM3211_65 
1.1675968 1.7287595 1.1541503 1.2013835 0.6304523 1.2699113 0.7400715 
WM3211_71  WM793_67  WM793_73    WM9_69    WM9_75 WM1158_68 WM1158_74 
0.8725916 1.0196053 1.0923370 1.5843539 0.4016549 1.4756025 0.8078638 
colData(dds)
DataFrame with 14 rows and 3 columns
             group        con sizeFactor
          <factor>   <factor>  <numeric>
NHEM76      NHEM     normal     1.167597
NHEM77      NHEM     normal     1.728759
Sbcl2_64    Sbcl2    radial     1.154150
Sbcl2_70    Sbcl2    radial     1.201383
WM1366_66   WM1366   vertical   0.630452
...            ...        ...        ...
WM793_73    WM793  vertical     1.092337
WM9_69      WM9    metastasis   1.584354
WM9_75      WM9    metastasis   0.401655
WM1158_68   WM1158 metastasis   1.475603
WM1158_74   WM1158 metastasis   0.807864
head(counts(dds)) #un-normalized
                NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66 WM1366_72
ENSG00000223972      2      0        2        1         3         5
ENSG00000227232      7      9       10        9         5        10
ENSG00000241860      0      2        2        2         2         5
ENSG00000279457      4      9       19       13         2         2
ENSG00000228463     11     20        1        3        10         5
ENSG00000237094      3      2        5        5         7        25
                WM3211_65 WM3211_71 WM793_67 WM793_73 WM9_69 WM9_75
ENSG00000223972         1         0        1        1      0      2
ENSG00000227232         7         4        4        5      8      5
ENSG00000241860         1         1        2        2      4      0
ENSG00000279457         2         6        5        4     12      2
ENSG00000228463         7         2        6        2      7      3
ENSG00000237094         6        11        1        4     16      6
                WM1158_68 WM1158_74
ENSG00000223972         2         2
ENSG00000227232        13         5
ENSG00000241860         9         1
ENSG00000279457         7         3
ENSG00000228463         2         0
ENSG00000237094        20        15



Extract the normalized counts

Now we have the size factors been calculated, and added to the DESeq2 object, the normalized counts can be extracted from it using the counts() function.

norm_counts<- counts(dds, normalized = TRUE)
head(norm_counts)
                 NHEM76    NHEM77   Sbcl2_64   Sbcl2_70 WM1366_66
ENSG00000223972 1.71292  0.000000  1.7328766  0.8323737  4.758488
ENSG00000227232 5.99522  5.206045  8.6643831  7.4913633  7.930814
ENSG00000241860 0.00000  1.156899  1.7328766  1.6647474  3.172326
ENSG00000279457 3.42584  5.206045 16.4623279 10.8208581  3.172326
ENSG00000228463 9.42106 11.568989  0.8664383  2.4971211 15.861628
ENSG00000237094 2.56938  1.156899  4.3321916  4.1618685 11.103139
                WM1366_72 WM3211_65 WM3211_71  WM793_67  WM793_73
ENSG00000223972  3.937283  1.351221  0.000000 0.9807716 0.9154684
ENSG00000227232  7.874565  9.458546  4.584046 3.9230866 4.5773421
ENSG00000241860  3.937283  1.351221  1.146011 1.9615433 1.8309368
ENSG00000279457  1.574913  2.702442  6.876069 4.9038582 3.6618737
ENSG00000228463  3.937283  9.458546  2.292023 5.8846298 1.8309368
ENSG00000237094 19.686414  8.107325 12.606126 0.9807716 3.6618737
                   WM9_69    WM9_75 WM1158_68 WM1158_74
ENSG00000223972  0.000000  4.979399  1.355379  2.475665
ENSG00000227232  5.049377 12.448497  8.809960  6.189162
ENSG00000241860  2.524688  0.000000  6.099203  1.237832
ENSG00000279457  7.574065  4.979399  4.743825  3.713497
ENSG00000228463  4.418205  7.469098  1.355379  0.000000
ENSG00000237094 10.098754 14.938196 13.553785 18.567486
sum(is.na(norm_counts))
[1] 0

GSEA save normalized counts for the GSEA

gsea_file <- read_delim("ddsNormSF.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
New names:
* `` -> `...1`
Rows: 26574 Columns: 15
-- Column specification ----------------------------------------------
Delimiter: "\t"
chr  (1): ...1
dbl (14): NHEM76, NHEM77, Sbcl2_64, Sbcl2_70, WM1366_66, WM1366_72...

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(gsea_file)
ABCDEFGHIJ0123456789
...1
<chr>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
WM3211_65
<dbl>
WM3211_71
<dbl>
ENSG000002239721.712920.0000001.73287660.83237374.7584883.9372831.3512210.000000
ENSG000002272325.995225.2060458.66438317.49136337.9308147.8745659.4585464.584046
ENSG000002418600.000001.1568991.73287661.66474743.1723263.9372831.3512211.146011
ENSG000002794573.425845.20604516.462327910.82085813.1723261.5749132.7024426.876069
ENSG000002284639.4210611.5689890.86643832.497121115.8616283.9372839.4585462.292023
ENSG000002370942.569381.1568994.33219164.161868511.10313919.6864148.10732512.606126
write.table(gsea_file, file = "gsea_inputfile.txt", quote = FALSE, row.names = FALSE, sep = "\t")
#test12 <- read.delim("ddsNormSF2.txt")
#head(test12)
#read_delim("file.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
#write.table(df, file = "df.txt", quote = FALSE, row.names = FALSE, sep = "\t")
#save the normalized counts in a table
#write.table(norm_counts, file = "C:/Users/melbe/Documents/R/R_Projects/Diff_Exp_thesis/ddsNormSF2.txt", sep = "\t", col.names = NA)

Now that we’ve normalized the counts, we can move on to the next step of our analysis. We can now compare the counts between the different samples because the counts have been normalized for library size.



Quality assessment of our samples

  • To measure the quality of our experiment, we can look at how the samples relate in terms of gene expression.

  • Visualization approaches for unsupervised clustering analyses, such as hierarchical heatmaps and principal component analysis PCA, are used to do this.

  • These QC approaches are used to determine how comparable the biological replicates are to one another, as well as to detect outlier samples and major sources of variation in the data set.

  • To better the visualization of the clustering, we should first use the log to transform the normalized counts before using these Visualization methods. DESeq2 applies a variance stabilizing transformation (VST) to RNA-Seq data, which is a logarithmic transformation that reduces variance across the mean.



Transform the normalized counts

The DESeq2 vst() function on the DESeq2 object can be used to transform the normalized counts. The blind = True argument indicates that the transformation should be blind to the sample information provided in the design formula; this argument should be stated during the quality evaluation.

vsd <- vst(dds, blind = TRUE)



Heatmaps

  • To assess the similarity and gene expression between different samples in a dataset, hierarchical clustering with heatmaps is used.

  • This method is used to determine how similar replicates are to one another and whether samples from various sample groups cluster together or separately. The heatmap is made by combining the gene expression correlation values for all pairwise combinations of samples in the data set, with 1 being the perfect correlation.

  • The heatmap’s colors reflect the correlation values, while the hierarchical tree shows which samples are more similar to one another. The biological replicates should be grouped together, whereas the sample conditions should be separated. Because the majority of genes should not be differentially expressed, samples should have a high correlation.

  • Samples having correlation values less than 0.8 may need to be investigated further to see if they are outliers or have contamination.



Extract the matrix of transformed counts

To make a heatmap, we’ll use the assay() function to extract the VST-transformed normalized counts as a matrix from a vsd object. The pairwise correlation values between each pair of samples can then be computed using the cor() function.

vsd_mat <- assay(vsd) 



Computing the correlation values between samples

vsd_cor <- cor(vsd_mat)



Plot the heatmap
  • To create the heatmap, we can use the pheatmap package after generating the correlation values.
  • The annotation arguments determine which meta data factors should be used as annotation bars. To select the condition column in colData1, we use the select () function from the dplyr package. The heat map’s output shows that the biological replicates are clustered together and the condition is separated.
pheatmap(vsd_cor, annotation = dplyr::select(colData, con))

The biological replicates are clustered together, while the samples from various conditions are clustered separately. There are no outliers or samples with low correlation values when compared to the rest of the data.



Pricipal component analysis PCA

  • In order to continue evaluating the quality of our samples, we will use PCA to see how our samples cluster and whether our condition of interest corresponds to the principal components explaining the most variation in the data.

  • PCA is a technique for emphasizing the variation in a dataset. The first principal component, or PCA1, represents the greatest amount of variance in the data.

  • we could plot the normalized counts of every gene for one sample in the x-axis and the other sample on the y-axis. PC2, the dataset’s second most variation, must be perpendicular to PC1 in order to best describe the variance in the dataset not included in PC1. PC2 has a much smaller spread.

  • The number of principal components in the dataset is equal to the number of samples, n. PC1 means plotting a line through n-dimensional space to find the greatest amount of variation.

  • The principal component with the most variant genes has the greatest influence on the direction of that principal component. Genes are given quantitative scores based on how much they influence the various PCs. The product of the influence and the normalized read counts for each gene is multiplied by all genes to get a ‘per sample’ PC value.We usually plot these per-sample PC values for PCA.

  • The gene expression profiles of samples that cluster together are more similar than those of samples that cluster apart, especially for the most variant genes. As we hope to see replicated clusters together and conditions to separate on PC1, this is a good method to explore the quality of the data.

  • This method can also be used to identify sample outliers and major sources of variation.

plotPCA(vsd, intgroup = 'con')

For the PCA plot generated in the previous part, the output regarding the quality of the samples shows that The biological replicates tend to cluster together. The samples separate by condition on PC1.



DE analysis

fitting the raw counts to the DESeq2 model.
  • Estimated size factors for each gene, as well as the variation in expression across replicates were evaluated. The data can then be fitted to the negative binomial model using these calculations.

  • A DESeq2 object containing the raw data, metadata, and design formula has already been created. The design formula tells DESeq2 which known major source of variation to control, for, or regress out, as well as which condition of interest to use for differential expression testing.

#run the analysis
dds <- DESeq(dds)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing

The final DESeq2 object will have all of the data required to perform differential expression analysis between different sample groups.

resultsNames(dds)
[1] "Intercept"                  "con_normal_vs_metastasis"  
[3] "con_radial_vs_metastasis"   "con_vertical_vs_metastasis"

???????????

set factor level

dds$con <- relevel(dds$con, ref = "normal")
#run the analysis again
dds <- DESeq(dds)
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resultsNames(dds)
[1] "Intercept"                "con_metastasis_vs_normal"
[3] "con_radial_vs_normal"     "con_vertical_vs_normal"  

???????????

Exploring the results

  • The raw counts are modeled using the dispersion estimates; we should investigate how the raw data will match the model.

  • The purpose of DE analysis is to see if the mean expression of a gene differs between sample groups when there is variation within groups. This is accomplished by determining whether the difference in log2 fold changes between groups is significantly different from zero.

  • The log2 fold changes are calculated by dividing the mean of one sample group by the mean of the other sample group. As a result, information regarding the mean and variation in the data is required to model the counts.



Plotting Dispersion

  • We’ll look at the variance in gene expression relative to the mean to see how our data varies.

  • Variance is the square of the standard deviation, representing how far away the expression of the individual samples, are from the means.

*The variance is expected to increase with the gene’s mean expression in RNA-Seq data. To observe this relationship, we can use the apply() function to calculate the means and variances for each gene in the normal samples. Then, using ggplot2, we create a data frame for plotting. The log10 scales can be used to plot the mean and variance for each gene.

  • we can plot the mean and variance for each gene using the log10 scales.

  • In the DESeq2 model, a metric called dispersion describes a measure of variance for a given mean to assess the variability in expression.

  • Dispersion formula: Var = μ + α*μ^2 .

plotDispEsts(dds)

  • Each black dot represents a gene, with associated mean and dispersion values. With RNA seq data, the variance of gene expression increases as the mean decreases, which is to be expected. Also, notice how the variance range for lower mean counts is wider than for higher mean counts.

  • As the mean increases, the dispersion values decrease. The increase in variance, on the other hand, increases dispersion.

  • Because gene-wise estimates of dispersion are often misleading in RNA-seq experiments with only a few replicates, DESeq2 uses information from all genes to determine the most likely estimates of dispersion for a given mean expression value, as shown by the red line in the figure.

  • Genes with inaccurately tiny estimates of variation could return many false postives, or genes classified as DE, when they are really not. Therefore, the original gene-wise dispersion estimates, shown as the black dots in the figure, are shrunken towards the curve to yield more accurate estimates of dispersion.

  • For identifying the differentially expressed genes, the more accurate, shrunken dispersion estimates are applied to model the counts.

*Extremely high dispersion values, encircled by blue circles, aren’t shrunk since there’s a chance the gene has more variability than others for biological or technical reasons, and lowering the variability could lead to false positives.

  • The strength of the shrinkage is determined by the sample size and distance from the curve. A large number of replicates allows for more accurate estimation of the mean and variation, resulting in less shrinkage. In the figure, The dispersions decrease with increasing mean and cluster around the maximum likelihood (ML) line, indicating a satisfactory match.



First comparison Normal Vs radial growth phase “RGP”

we will try to find the significant genes during the development of cancer cells from melanocytes to the Radial growth phase.

Extraction of results (normal vs RGP)

  • Now that we have explored the fit of our data to the model, we can extract the DE testing model.

  • We can also get more precise foldchange estimates, which measure the expression of one sample group compared to another. The Wald test is used by DESeq2 to compare two sample groups for the condition of interest, in this case “normal and radial growth Phase.”

#sadad
dds1_res <- results(dds, contrast = c("con", "radial", "normal"))
dds1_res
log2 fold change (MLE): con radial vs normal 
Wald test p-value: con radial vs normal 
DataFrame with 26574 rows and 6 columns
                  baseMean log2FoldChange     lfcSE      stat
                 <numeric>      <numeric> <numeric> <numeric>
ENSG00000223972    1.78799       0.710301  1.992192  0.356542
ENSG00000227232    7.01446       0.536886  0.733066  0.732383
ENSG00000241860    1.98683       1.378421  1.664696  0.828031
ENSG00000279457    5.70124       1.632679  0.796839  2.048945
ENSG00000228463    5.49010      -2.638075  1.296266 -2.035134
...                    ...            ...       ...       ...
ENSG00000198695  399.68416      -1.588101  0.476504 -3.332819
ENSG00000210194    2.94970      -1.894507  1.723122 -1.099462
ENSG00000198727 1411.29417      -1.208026  0.401252 -3.010640
ENSG00000210195    9.21425      -1.048826  0.793127 -1.322394
ENSG00000210196   58.66427      -0.208716  0.492520 -0.423773
                     pvalue      padj
                  <numeric> <numeric>
ENSG00000223972   0.7214345  0.851775
ENSG00000227232   0.4639346  0.663017
ENSG00000241860   0.4076527  0.612446
ENSG00000279457   0.0404674  0.130239
ENSG00000228463   0.0418374  0.133405
...                     ...       ...
ENSG00000198695 0.000859709 0.0067447
ENSG00000210194 0.271566402 0.4737640
ENSG00000198727 0.002606980 0.0164832
ENSG00000210195 0.186036892 0.3711853
ENSG00000210196 0.671731428 0.8178945

log2 fold change : condition radial vs normal, stating that normal is the base level of comparison. Which means that all log2fold changes represent the radial group relative to the normal group.

The MA plot

For all genes analyzed, the MA plot depicts the mean of normalized counts vs log2foldchanges.

plotMA(dds, ylim=c(-20,20))

  • The genes with a significant DE are colored in blue.
  • The large log2 foldchanges, especially for genes with lower mean count values, should be noted. Genes with less information, such as those with a low number of counts or high dispersion values, are unlikely to be as accurate as genes with more information.
  • We can use log2foldchange shrinkage to improve the estimated fold changes.

LFC shrinkage

for genes with low amount of data available, shrinkage utilizes information from all genes to provide more likely, lower, log2 fold change estimates, similar to what we performed with dispersion.

dds1_LFC <- lfcShrink(dds,
                      coef = "con_radial_vs_normal" ,
                      res = dds1_res)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(dds1_LFC, ylim=c(-20,20))

  • The log2fold change values are now more restricted, especially for lowly expressed genes.

  • The shrunken log2foldchanges should be more precise; however, shrinking the log2 foldchanges will not affect the number of differentially expressed genes returned, only the log2 fold change values. Now we can retrieve the significant DE genes and perform further visualization of results.



DESeq2 results table

getting the description for the columns in the results table

mcols(dds1_LFC)
DataFrame with 5 rows and 2 columns
                       type            description
                <character>            <character>
baseMean       intermediate mean of normalized c..
log2FoldChange      results log2 fold change (MA..
lfcSE               results posterior SD: con ra..
pvalue              results Wald test p-value: c..
padj                results   BH adjusted p-values



determine siginificant DE gene

  • We will use the p-value adjusted for the multiple test correction. Because, every gene tested with alpha equals to 0.05, meanst that there is a 5% chance that the gene is called as a DE when it is not, returning false positives. For this reason, multiple test correction is performed by DESeq2 using Benjamin-Hochberg, or BH-method, to adjust the p-values for multiple testing and control the proportion of false postives relative to true.

  • If we had 1000 genes categorized as DE and used the BH-method with an alpha value of 0.05, we would expect 5% of the DE genes to be false positives or 50 genes.

  • Prior to testing, DESeq2 automatically filters out genes that are unlikely to be actually differentially expressed, such as genes with zero counts across all samples, genes with low mean values across all samples, and genes with dramatic count outlines, to limit the number of genes tested.

head(dds1_LFC,n =10 )
log2 fold change (MAP): con radial vs normal 
Wald test p-value: con radial vs normal 
DataFrame with 10 rows and 5 columns
                 baseMean log2FoldChange     lfcSE      pvalue
                <numeric>      <numeric> <numeric>   <numeric>
ENSG00000223972   1.78799       0.132969  0.885471 0.721434541
ENSG00000227232   7.01446       0.350785  0.604942 0.463934557
ENSG00000241860   1.98683       0.381369  0.901971 0.407652728
ENSG00000279457   5.70124       1.183497  0.769140 0.040467446
ENSG00000228463   5.49010      -1.436076  1.310678 0.041837426
ENSG00000237094   8.96601       0.562235  0.843891 0.280379678
ENSG00000225972   5.58042       0.564300  0.788241 0.286326887
ENSG00000225630  38.66330      -2.618253  0.785182 0.000084522
ENSG00000237973 141.47226      -0.136626  0.572538 0.763457237
ENSG00000229344  94.37507       0.194176  0.689321 0.692876738
                       padj
                  <numeric>
ENSG00000223972 0.851775139
ENSG00000227232 0.663016611
ENSG00000241860 0.612445520
ENSG00000279457 0.130239006
ENSG00000228463 0.133405074
ENSG00000237094 0.483646137
ENSG00000225972 0.490378302
ENSG00000225630 0.000990717
ENSG00000237973 0.880374602
ENSG00000229344 0.832377164

The filtered genes are denoted by a NA in the p-adjusted column in the results table.



Siginificant DE genes summary

summary(dds1_LFC)

out of 26574 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 3063, 12%
LFC < 0 (down)     : 2972, 11%
outliers [1]       : 7, 0.026%
low counts [2]     : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Over 1000 genes are DE which is the sum of log fold changes less than zero and greater than zero.

summary(dds1_LFC, alpha = 0.05)

out of 26574 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 2308, 8.7%
LFC < 0 (down)     : 2401, 9%
outliers [1]       : 7, 0.026%
low counts [2]     : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



Testing for siginifcant genes using both an alpha value threshold and a log2fold change threshold different from 0

A log2fold change threshold could be used to return genes that are most likely to be biologically meaningful. A log2fold change threshold isn’t always the best choice. It can, however, be useful when dealing with huge numbers of DE genes.

#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale 
#dds1_res <- results(dds, 
 #                  contrast = c("con", "radial", "normal"),
  #                  alpha = 0.05,
#                    lfcThreshold = 0.32) 
#head(dds1_res)

While using any log2fold change cut-off raises the risk of losing biologically relevant genes, by using a very small log2 fold change threshold, we are aiming to reduce the risk that the more biologically relevant.

resultsNames(dds)
[1] "Intercept"                "con_metastasis_vs_normal"
[3] "con_radial_vs_normal"     "con_vertical_vs_normal"  
#Reshrink the foldchanges 
dds1_LFC <- lfcShrink(dds,
                      coef = "con_radial_vs_normal",
                      res = dds1_res)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Turn results table into a data frame 
dds1_res_df <- data.frame(dds1_LFC)
head (dds1_res_df)
ABCDEFGHIJ0123456789
 
 
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
ENSG000002239721.7879890.13296910.88547070.721434540.8517751
ENSG000002272327.0144580.35078500.60494230.463934560.6630166
ENSG000002418601.9868260.38136930.90197110.407652730.6124455
ENSG000002794575.7012381.18349720.76913990.040467450.1302390
ENSG000002284635.490095-1.43607591.31067810.041837430.1334051
ENSG000002370948.9660150.56223490.84389070.280379680.4836461

Ordering the genes by padjusted values for GSEA

dds1_GSEA <- dds1_res_df %>% arrange(padj)

save the dds1_res object in a table to use it for GSEA

#write.table(dds1_res, file = "DE_VGP_MET.txt", sep = " ", col.names = NA)
#grcm38

dds1_res_anno <- rownames_to_column(dds1_res_df, var = "ensgene") 
head(dds1_res_anno)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
1ENSG000002239721.7879890.13296910.88547070.721434540.8517751
2ENSG000002272327.0144580.35078500.60494230.463934560.6630166
3ENSG000002418601.9868260.38136930.90197110.407652730.6124455
4ENSG000002794575.7012381.18349720.76913990.040467450.1302390
5ENSG000002284635.490095-1.43607591.31067810.041837430.1334051
6ENSG000002370948.9660150.56223490.84389070.280379680.4836461
hm_annotables <- grch38[, c( "ensgene", "symbol", "description")]

dds_test <- merge(dds1_res_anno, hm_annotables, by.x = "ensgene", by.y = "ensgene", all.x = T )
#dds_test <- dds_test%>% dplyr::select(-ends_with(".y"))
head(dds_test) 
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000000003571.668600.870412840.48152633.874260e-020.125803292TSPAN6
2ENSG000000004191748.176470.524497590.40181901.519862e-010.325413268DPM1
3ENSG00000000457180.215571.438137430.38182145.129922e-050.000650043SCYL3
4ENSG00000000460534.676200.084983250.29941367.698350e-010.883638824C1orf112
5ENSG0000000097113.816951.251494781.03082124.970123e-020.151470901CFH
6ENSG00000001036444.54055-0.131734870.22965755.518713e-010.734103036FUCA2
NA
head(grch38)
ABCDEFGHIJ0123456789
ensgene
<chr>
entrez
<int>
symbol
<chr>
chr
<chr>
start
<int>
end
<int>
strand
<int>
biotype
<chr>
ENSG000000000037105TSPAN6X100627108100639991-1protein_coding
ENSG0000000000564102TNMDX1005849361005998851protein_coding
ENSG000000004198813DPM1205093486750959140-1protein_coding
ENSG0000000045757147SCYL31169849631169894267-1protein_coding
ENSG0000000046055732C1orf11211696620071698540801protein_coding
ENSG000000009382268FGR12761206427635185-1protein_coding
dds1_res_anno <- left_join(x = dds1_res_anno,
                 y = grch38[ , c("ensgene", "symbol", "description")],
                  by = c("ensgene"))

Remove Duplicates

#non_duplicates <- which(duplicated(ids$symbol) == FALSE)
head(dds1_res_anno)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG000002239721.7879890.13296910.88547070.72143450.8517751DDX11L1
2ENSG000002239721.7879890.13296910.88547070.72143450.8517751DDX11L1
3ENSG000002239721.7879890.13296910.88547070.72143450.8517751DDX11L1
4ENSG000002239721.7879890.13296910.88547070.72143450.8517751DDX11L1
5ENSG000002239721.7879890.13296910.88547070.72143450.8517751DDX11L1
6ENSG000002272327.0144580.35078500.60494230.46393460.6630166WASH7P
head(dds1_LFC)
log2 fold change (MAP): con radial vs normal 
Wald test p-value: con radial vs normal 
DataFrame with 6 rows and 5 columns
                 baseMean log2FoldChange     lfcSE    pvalue
                <numeric>      <numeric> <numeric> <numeric>
ENSG00000223972   1.78799       0.132969  0.885471 0.7214345
ENSG00000227232   7.01446       0.350785  0.604942 0.4639346
ENSG00000241860   1.98683       0.381369  0.901971 0.4076527
ENSG00000279457   5.70124       1.183497  0.769140 0.0404674
ENSG00000228463   5.49010      -1.436076  1.310678 0.0418374
ENSG00000237094   8.96601       0.562235  0.843891 0.2803797
                     padj
                <numeric>
ENSG00000223972  0.851775
ENSG00000227232  0.663017
ENSG00000241860  0.612446
ENSG00000279457  0.130239
ENSG00000228463  0.133405
ENSG00000237094  0.483646
head(grch38)
ABCDEFGHIJ0123456789
ensgene
<chr>
entrez
<int>
symbol
<chr>
chr
<chr>
start
<int>
end
<int>
strand
<int>
biotype
<chr>
ENSG000000000037105TSPAN6X100627108100639991-1protein_coding
ENSG0000000000564102TNMDX1005849361005998851protein_coding
ENSG000000004198813DPM1205093486750959140-1protein_coding
ENSG0000000045757147SCYL31169849631169894267-1protein_coding
ENSG0000000046055732C1orf11211696620071698540801protein_coding
ENSG000000009382268FGR12761206427635185-1protein_coding



Extracting the siginficant DE genes

dds1_sig <- subset(dds1_res_anno, padj < 0.05)



Ordering the genes by padjusted values

dds1_sig <- dds1_sig %>% arrange(padj)



Exploring the final table

View(dds1_sig)
dds1_first20 <- dds1_sig[1:20,]
View(dds1_first20)
write.csv(dds1_first20, "dds1_first20.csv", row.names = F)



Visualization of the results

Expression heatmap

# Subset normalized counts to significant genes 
sig1_norm_counts <- norm_counts[dds1_sig$ensgene,]

# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()

# Run pheatmap
pheatmap(sig1_norm_counts[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")



???????????????????????????????????????? #### Rlog Transformation for clustering and heatmaps

rld<- rlogTransformation(dds)
head(assay(rld))
                    NHEM76     NHEM77  Sbcl2_64  Sbcl2_70 WM1366_66
ENSG00000223972 0.59199618 -0.2475137 0.5982058 0.2624646  1.238528
ENSG00000227232 2.65519006  2.5390191 2.9659902 2.8402982  2.883771
ENSG00000241860 0.07606271  0.6071246 0.8101048 0.7889924  1.150871
ENSG00000279457 2.04768637  2.3607590 3.3946150 2.9942330  2.018953
ENSG00000228463 2.76019786  2.9639344 1.1767126 1.6998418  3.235212
ENSG00000237094 2.05828030  1.5864318 2.4208884 2.3893842  3.215220
                WM1366_72 WM3211_65  WM3211_71  WM793_67  WM793_73
ENSG00000223972  1.142079 0.4705563 -0.1237509 0.3304930 0.3015933
ENSG00000227232  2.883422 3.0371637  2.4531576 2.3373055 2.4466286
ENSG00000241860  1.332886 0.6969742  0.6241054 0.8766034 0.8394052
ENSG00000279457  1.577442 1.9135689  2.5849665 2.3133343 2.0960224
ENSG00000228463  2.019581 2.7474891  1.6624998 2.3398594 1.5198851
ENSG00000237094  3.791069 2.9359794  3.3425277 1.6105970 2.2993169
                    WM9_69    WM9_75 WM1158_68 WM1158_74
ENSG00000223972 -0.2332217 1.2210628 0.4700582 0.7969234
ENSG00000227232  2.5157238 3.2636258 2.9828250 2.6826526
ENSG00000241860  1.0332529 0.3052228 1.6796610 0.6579017
ENSG00000279457  2.6768982 2.3263161 2.2865932 2.1127464
ENSG00000228463  2.1095090 2.5062586 1.3374956 0.8738203
ENSG00000237094  3.1422104 3.4776052 3.4220916 3.7207741
hist(assay(rld))

sampleDists <- as.matrix(dist(t(assay(rld))))
plotPCA(rld, intgroup="con")

plotPCA(vsd, intgroup = 'con')

Heatmap rld

we (select) the genes for the heatmap in (order) and this order is selected by (rowMeans) and select the (count) of my deseq data set (normalized) to library size and we want (decreasing) to have info about all upregulated and down regulated genes and set the number to present or to visualize [1:50] then we have to transform my normalized count and we select log2.norm.counts which i will use in the heatmap and then tell the heatmap that the annotation_col is the df (annotation_col=df)

#rowmeans is wrong ????
select1 <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE) [1:50]
nt <- normTransform(dds) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[select1,]
df <- as.data.frame(colData(dds)[,c("group","con")])
pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=FALSE, annotation_col=df, fontsize_row = 5)

From the heatmap we can Know the expression level of genes: Dark blue indicates very low expressed genes light blue indicates low expressed genes Red indicates very highly expressed genes

Heatmap of regularized log transformed dds counts

df <- as.data.frame(colData(rld)[,c("con","group")])
pheatmap(assay(rld)[select1,], cluster_rows=T, show_rownames=TRUE,
cluster_cols=T, annotation_col=df, fontsize_row =8)

top1_20 <- data.frame(sig1_norm_counts)[1:20,]  %>% rownames_to_column(var ="ensgene")
#key column will be a seperate column
top1_20 <- gather(top1_20, key ="samplename", value ="normalized_counts",2:4)
#head(sig1_norm_counts)

top1_20 <- inner_join(top1_20, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
ggplot(top1_20)+ 
  geom_point(aes(x = ensgene, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

Test

#good idea is to make symbol as rownames

#top1_20_symbol <- data.frame(sig1_norm_counts)[1:20,]  %>% rownames_to_column(var ="symbol")
#top1_20_symbol <- gather(top1_20, key ="samplename", value ="normalized_counts",2:4)
#top1_20_symbol <- data.frame(sig1_norm_symbol)[1:60,]  %>% rownames_to_column(var ="symbol")
#top1_20_symbol <- gather(top1_20_symbol, key ="samplename", value ="normalized_counts",2:4)
#top1_20_symbol <- inner_join(top1_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
#head(top1_20_smybol)

# adding symbol to norm counts
#sig1_norm_symbol <- rownames_to_column(dds1_res_df, var = "ensgene")

#pick symbol from gr38
#sig1_norm_symbol <- left_join(x = sig1_norm_symbol,
        #         y = grch38[ , c("ensgene", "symbol")],
         #         by = c("ensgene"))
#
#remove duplicates
#sig1_norm_symbol <- unique(sig1_norm_symbol)

#Order by p-adjusted Value
#sig1_norm_symbol <- sig1_norm_symbol %>% arrange(padj)

#get 1st 50
#sig1_norm_symbol <- sig1_norm_symbol[1:70,]

#change symbol to be rownames insted of ensembl ids
#row.names(sig1_norm_symbol) <- sig1_norm_symbol$symbol

#remove last column 
#sig1_norm_symbol <- sig1_norm_symbol[,-7]

#another try


## Subset normalized counts to significant genes 
sig1_norm_symbol <- norm_counts[dds1_sig$ensgene,]

#apply this to norm counts
sig1_norm_symbol <- rownames_to_column(as.data.frame(sig1_norm_symbol), var = "ensgene")

#pick symbol from gr38
sig1_norm_symbol <- left_join(x =  sig1_norm_symbol,
                 y = grch38[ , c("ensgene", "symbol")],
                  by = c("ensgene"))

#remove duplicates
sig1_norm_symbol <- unique(sig1_norm_symbol)
top1_20_symbol <- data.frame(sig1_norm_symbol)[1:20,]  %>% rownames_to_column(var ="ids")

#remove ids column
top1_20_symbol <- top1_20_symbol[,-1]

#key column will be a seperate column
top1_20_symbol <- gather(top1_20_symbol, key ="samplename", value ="normalized_counts",2:4)

top1_20_symbol <- inner_join(top1_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
ggplot(top1_20_symbol)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

Heatmap

# Run pheatmap
pheatmap(sig1_norm_counts[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")

head(sig1_norm_counts)
                    NHEM76      NHEM77  Sbcl2_64   Sbcl2_70 WM1366_66
ENSG00000136040 4982.02798 4788.983166  49.38698   53.27192   25.3786
ENSG00000123374 4500.69744 4672.714822 556.25340  526.89255  885.0788
ENSG00000008256 1040.59893 1029.061608  81.44520   95.72298  762.9443
ENSG00000119042   20.55504   11.568989 604.77394  598.47669  453.6426
ENSG00000138771    4.28230    8.098293 997.27050 1432.51513   68.2050
ENSG00000135047 9487.00772 8739.793049 280.72601  357.08832  607.5003
                WM1366_72 WM3211_65 WM3211_71   WM793_67  WM793_73
ENSG00000136040  38.58537  33.78052  35.52636   37.26932  32.04139
ENSG00000123374 948.88514 867.48375 877.84480  844.44438 905.39826
ENSG00000008256 690.59939 906.66916 694.48296  823.84818 789.13377
ENSG00000119042 572.48091 408.06868 500.80702  488.42428 496.18388
ENSG00000138771  92.91987  48.64395  68.76069   58.84630  69.57560
ENSG00000135047 724.46002 664.80063 755.22157 1059.23337 632.58868
                   WM9_69     WM9_75 WM1158_68  WM1158_74
ENSG00000136040   49.8626   49.79399  65.05817   68.08078
ENSG00000123374  724.5856  689.64674 693.95380  852.86652
ENSG00000008256  340.8329  338.59912 267.68726  299.55544
ENSG00000119042  217.1232  243.99054 321.22471  257.46914
ENSG00000138771  134.4397  201.66565 168.06694  196.81535
ENSG00000135047 1449.8023 1175.13812 828.13628 1067.01152
head(sig1_norm_symbol)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
WM3211_65
<dbl>
1ENSG000001360404982.027984788.98316649.3869853.2719225.378638.5853733.78052
2ENSG000001233744500.697444672.714822556.25340526.89255885.0788948.88514867.48375
3ENSG000000082561040.598931029.06160881.4452095.72298762.9443690.59939906.66916
4ENSG0000011904220.5550411.568989604.77394598.47669453.6426572.48091408.06868
5ENSG000001387714.282308.098293997.270501432.5151368.205092.9198748.64395
6ENSG000001350479487.007728739.793049280.72601357.08832607.5003724.46002664.80063
#remove duplicates
sig1_norm_symbol <- unique(sig1_norm_symbol)

#get 1st 50
sig1_norm_symbol <- sig1_norm_symbol[1:70,]

#change symbol to be rownames instead of ensembl ids
row.names(sig1_norm_symbol) <- sig1_norm_symbol$symbol

#remove first and last column 
sig1_norm_symbol <- sig1_norm_symbol[,c(-1,-16)]
# Run pheatmap
pheatmap(sig1_norm_symbol[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")

pheatmap(sig1_norm_symbol[1:30,],
         #color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),cluster_cols=T,               fontsize_row = 7,
         scale ="row")

# Run pheatmap
pheatmap(sig1_norm_symbol[1:30,1:4],
         #color = heat_colors, 
         cluster_rows =F,
         cluster_cols = F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         fontsize_row = 7,
         scale ="row")



Releveling

Instead of using the revel() function to make multiple comparisons, we will subset the data to make the desired comparison clear.

Secound comparison RGP Vs vertical growth phase “VGP”

we will try to find the significant genes during the development of cancer cells from the Radial growth phase to vertical growth phase.

??????????????????????????????

resultsNames(dds)
[1] "Intercept"                "con_metastasis_vs_normal"
[3] "con_radial_vs_normal"     "con_vertical_vs_normal"  

set factor level

dds$con <- relevel(dds$con, ref = "radial")

DE analysis

Fitting the raw counts to the DESeq2 model.
#run the analysis again
dds2 <- DESeq(dds)
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resultsNames(dds2)
[1] "Intercept"                "con_normal_vs_radial"    
[3] "con_metastasis_vs_radial" "con_vertical_vs_radial"  



Colors for plots

#mycols <- brewer.pal(11, "Set3")[1:length(unique(group1))]



Checking the design of your DESEeqDataSets

design(dds2)
~con



head(counts(dds2)) #un-normalized
                NHEM76 NHEM77 Sbcl2_64 Sbcl2_70 WM1366_66 WM1366_72
ENSG00000223972      2      0        2        1         3         5
ENSG00000227232      7      9       10        9         5        10
ENSG00000241860      0      2        2        2         2         5
ENSG00000279457      4      9       19       13         2         2
ENSG00000228463     11     20        1        3        10         5
ENSG00000237094      3      2        5        5         7        25
                WM3211_65 WM3211_71 WM793_67 WM793_73 WM9_69 WM9_75
ENSG00000223972         1         0        1        1      0      2
ENSG00000227232         7         4        4        5      8      5
ENSG00000241860         1         1        2        2      4      0
ENSG00000279457         2         6        5        4     12      2
ENSG00000228463         7         2        6        2      7      3
ENSG00000237094         6        11        1        4     16      6
                WM1158_68 WM1158_74
ENSG00000223972         2         2
ENSG00000227232        13         5
ENSG00000241860         9         1
ENSG00000279457         7         3
ENSG00000228463         2         0
ENSG00000237094        20        15



Exploring the results

plotDispEsts(dds2)



Extract the DE testing model.

Extraction of results


dds2_res <- results(dds2, contrast = c("con", "vertical", "radial"))
dds2_res
log2 fold change (MLE): con vertical vs radial 
Wald test p-value: con vertical vs radial 
DataFrame with 26574 rows and 6 columns
                  baseMean log2FoldChange     lfcSE      stat
                 <numeric>      <numeric> <numeric> <numeric>
ENSG00000223972    1.78799       0.615814  1.563997  0.393744
ENSG00000227232    7.01446      -0.362312  0.606026 -0.597848
ENSG00000241860    1.98683       0.411896  1.202772  0.342455
ENSG00000279457    5.70124      -1.849810  0.658504 -2.809110
ENSG00000228463    5.49010       1.898228  1.143957  1.659353
...                    ...            ...       ...       ...
ENSG00000198695  399.68416     -0.1841260  0.391999 -0.469710
ENSG00000210194    2.94970     -0.7581881  1.536414 -0.493479
ENSG00000198727 1411.29417      0.2099735  0.328341  0.639499
ENSG00000210195    9.21425     -1.2067161  0.712404 -1.693864
ENSG00000210196   58.66427      0.0646426  0.409127  0.158001
                    pvalue      padj
                 <numeric> <numeric>
ENSG00000223972 0.69377001 0.8556240
ENSG00000227232 0.54994104 0.7675305
ENSG00000241860 0.73200836 0.8779981
ENSG00000279457 0.00496786 0.0361983
ENSG00000228463 0.09704464 0.2871696
...                    ...       ...
ENSG00000198695  0.6385620  0.823365
ENSG00000210194  0.6216742  0.812410
ENSG00000198727  0.5224986  0.747588
ENSG00000210195  0.0902911  0.274519
ENSG00000210196  0.8744558  0.949233



The MA plot

plotMA(dds2, ylim=c(-20,20))

The genes that are significantly DE are colored blue.



LFC shrinkage
dds2_LFC <- lfcShrink(dds2,
                      coef = "con_vertical_vs_radial" ,
                      res = dds2_res)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(dds2_LFC, ylim=c(-20,20))



DESeq2 results table

Getting the description for the columns in the results table

mcols(dds2_LFC)
DataFrame with 5 rows and 2 columns
                       type            description
                <character>            <character>
baseMean       intermediate mean of normalized c..
log2FoldChange      results log2 fold change (MA..
lfcSE               results posterior SD: con ve..
pvalue              results Wald test p-value: c..
padj                results   BH adjusted p-values



Determining the siginificant DE genes,

  • We will use the adjusted p-value for the multiple test correction.

  • The reason for this is that for every gene tested with alpha 0.05, there is a 5% chance that the gene is called as a DE when it is not, yielding false positives.

  • Therefore, multiple test correction is performed by DESeq2 using Benjamin-Hochberg, or BH-method, to adjust the p-values for multiple testing and control the proportion of false postives relative to true.

  • Using BH-method and alpha value of 0.05, if we had 1000 genes identified as DE, we would expect 5% of the DE genes to be false positives, or 50 genes. to reduce the number of genes tested, DESeq2 automatically filters out genes unlikely to be truly differential expressed prior to testing, such as genes with zero counts across all samples, genes with low mean values across all samples, and genes with extreme count outlines.

head(dds2_LFC,n =10 )
log2 fold change (MAP): con vertical vs radial 
Wald test p-value: con vertical vs radial 
DataFrame with 10 rows and 5 columns
                 baseMean log2FoldChange     lfcSE      pvalue
                <numeric>      <numeric> <numeric>   <numeric>
ENSG00000223972   1.78799      0.0870122  0.599502 0.693770012
ENSG00000227232   7.01446     -0.1934853  0.452214 0.549941038
ENSG00000241860   1.98683      0.0919239  0.570040 0.732008359
ENSG00000279457   5.70124     -1.4218850  0.683367 0.004967865
ENSG00000228463   5.49010      0.5511408  0.817487 0.097044645
ENSG00000237094   8.96601      0.4654540  0.647805 0.179601852
ENSG00000225972   5.58042     -0.3012066  0.548882 0.366442044
ENSG00000225630  38.66330     -0.0128316  0.454937 0.968476613
ENSG00000237973 141.47226      0.0609502  0.430149 0.852287098
ENSG00000229344  94.37507      2.0547940  0.933471 0.000812679
                      padj
                 <numeric>
ENSG00000223972 0.85562404
ENSG00000227232 0.76753046
ENSG00000241860 0.87799812
ENSG00000279457 0.03619831
ENSG00000228463 0.28716961
ENSG00000237094 0.41486071
ENSG00000225972 0.62133417
ENSG00000225630 0.99138372
ENSG00000237973 0.93853307
ENSG00000229344 0.00892282

We can see the filtered genes in the results table represented by an NA in the p-adjusted column.



Siginificant DE genes summary

summary(dds2_LFC)

out of 26574 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 2421, 9.1%
LFC < 0 (down)     : 1902, 7.2%
outliers [1]       : 7, 0.026%
low counts [2]     : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Over 4000 genes are DE which is the sum of log fold changes less than zero and greater than zero.



summary(dds2_LFC, alpha = 0.05)

out of 26574 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 1903, 7.2%
LFC < 0 (down)     : 1388, 5.2%
outliers [1]       : 7, 0.026%
low counts [2]     : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



Testing for siginifcant genes using both an alpha value threshold and a log2fold change threshold different from 0
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale 
#remove lfcThreshold use lfc = 1??????
dds2_res <- results(dds2, 
                    contrast = c("con", "vertical", "radial"),
                    alpha = 0.05,
                    #lfcThreshold = 0.32
                    ) 
#Reshrink the foldchanges 
dds2_LFC <- lfcShrink(dds2,
                      coef = "con_vertical_vs_radial" ,
                      res = dds2_res)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895



Annotaion of the genes

We will use the “annotables” library to annotate the Ensembl genes.

# Turn results table into a data frame 
dds2_res_df <- data.frame(dds2_LFC)
head (dds2_res_df)
ABCDEFGHIJ0123456789
 
 
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
ENSG000002239721.7879890.087012210.59950230.693770012NA
ENSG000002272327.014458-0.193485320.45221440.5499410380.76222194
ENSG000002418601.9868260.091923890.57003970.732008359NA
ENSG000002794575.701238-1.421885000.68336690.0049678650.03459684
ENSG000002284635.4900950.551140760.81748690.0970446450.27920595
ENSG000002370948.9660150.465453970.64780510.1796018520.40664269
head(grcm38)
ABCDEFGHIJ0123456789
ensgene
<chr>
entrez
<int>
symbol
<chr>
chr
<chr>
start
<int>
end
<int>
strand
<int>
biotype
<chr>
ENSMUSG0000000000114679Gnai33108014596108053462-1protein_coding
ENSMUSG0000000000354192PbsnX7688150776897229-1protein_coding
ENSMUSG0000000002812544Cdc45161859919718630737-1protein_coding
ENSMUSG00000000031NAH197142129266142131880-1lncRNA
ENSMUSG00000000037107815Scml2X1598655211600412091protein_coding
ENSMUSG0000000004911818Apoh111082341801083052221protein_coding

dds2_res_anno <- rownames_to_column(dds2_res_df, var = "ensgene") 
dds2_res_anno <- left_join(x = dds2_res_anno,
                 y = grch38[, c("ensgene", "symbol", "description")],
                  by = c("ensgene"))
#View(dds2_res_anno)



Extraction of the siginficant DE genes

dds2_sig <- subset(dds2_res_anno, padj < 0.05)



#### ordering the genes by p-adjusted values

dds2_sig <- dds2_sig %>% arrange(padj)
head(dds2_sig)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000164741938.537144.3299800.22787395.834329e-821.189270e-77DLC1
2ENSG00000008256582.941523.1124430.18537741.561472e-641.591452e-60CYTH3
3ENSG000001245751932.32137-10.0116410.67413295.887644e-504.000458e-46H1-3
4ENSG00000138771253.57894-4.1161670.28191256.602015e-493.364387e-45SHROOM3
5ENSG0000016257673.99471-6.8813970.49409429.545988e-453.891708e-41MXRA8
6ENSG00000206538447.019364.6795960.34111582.540234e-448.630023e-41VGLL3
head(dds2_sig)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000164741938.537144.3299800.22787395.834329e-821.189270e-77DLC1
2ENSG00000008256582.941523.1124430.18537741.561472e-641.591452e-60CYTH3
3ENSG000001245751932.32137-10.0116410.67413295.887644e-504.000458e-46H1-3
4ENSG00000138771253.57894-4.1161670.28191256.602015e-493.364387e-45SHROOM3
5ENSG0000016257673.99471-6.8813970.49409429.545988e-453.891708e-41MXRA8
6ENSG00000206538447.019364.6795960.34111582.540234e-448.630023e-41VGLL3
dds2_first20 <- dds2_sig[1:20,]
#View(dds2_first20)
write.csv(dds2_first20, "dds2_first20.csv", row.names = F)



Exploring the final table

#View(dds2_sig)



Visualization of the results

Expression heatmap

# Subset normalized counts to significant genes 
sig_norm_counts2 <- norm_counts[dds2_sig$ensgene,]

# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()

# Run pheatmap
pheatmap(sig_norm_counts2,
         color = heat_colors, 
         cluster_rows =T,
         show_rownames =F,
         annotation = dplyr::select(colData, con),
         scale ="row")



Visualizing results-Expression plot

top2_20 <- data.frame(sig_norm_counts2)[1:20,]  %>% rownames_to_column(var ="symbol")
top2_20 <- gather(top2_20, key ="samplename", value ="normalized_counts",4:9)
top2_20 <- inner_join(top2_20, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
ggplot(top2_20)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))
Warning: Transformation introduced infinite values in continuous y-axis


## Subset normalized counts to significant genes 
sig2_norm_symbol <- norm_counts[dds2_sig$ensgene,]

#apply this to norm counts
sig2_norm_symbol <- rownames_to_column(as.data.frame(sig2_norm_symbol), var = "ensgene")

#pick symbol from gr38
sig2_norm_symbol <- left_join(x =  sig2_norm_symbol,
                 y = grch38[ , c("ensgene", "symbol")],
                  by = c("ensgene"))

#remove duplicates
sig2_norm_symbol <- unique(sig2_norm_symbol)
head(sig2_norm_symbol)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
1ENSG00000164741276.63659257.988463861.5171280.740251568.7149781730.829490
2ENSG000000082561040.598931029.061608081.4452095.72298762.944292690.599392
3ENSG000001245753493.500453077.35118293734.349123652.455784.7584883.149826
4ENSG000001387714.282308.0982926997.270501432.5151368.20499992.919873
5ENSG000001625762.569381.1568989432.35272542.707653.1723263.937283
6ENSG000002065383.425840.578449536.3904139.121561134.1063791306.390411
head(top2_20_symbol)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
NHEM76
<dbl>
NHEM77
<dbl>
WM793_67
<dbl>
WM793_73
<dbl>
WM9_69
<dbl>
WM9_75
<dbl>
1ENSG00000164741276.63659257.98846381080.8103481384.188246791.489826963.513674
2ENSG000000082561040.598931029.0616080823.848178789.133775340.832940338.599120
3ENSG000001245753493.500453077.35118294.9038584.5773421789.3729331224.932112
4ENSG000001387714.282308.098292658.84629869.575600134.439660201.665653
5ENSG000001625762.569381.15689891.9615434.5773424.4182054.979399
6ENSG000002065383.425840.5784495940.560004960.32636932.18977839.835191
top2_20_symbol <- data.frame(sig2_norm_symbol)[1:20,]  %>% rownames_to_column(var ="ids")

#remove ids column
top2_20_symbol <- top2_20_symbol[,-1]

#key column will be a seperate column
top2_20_symbol <- gather(top2_20_symbol, key ="samplename", value ="normalized_counts",4:9)

top2_20_symbol <- inner_join(top2_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
ggplot(top2_20_symbol)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))
Warning: Transformation introduced infinite values in continuous y-axis

Heatmap

#remove duplicates
sig2_norm_symbol <- unique(sig2_norm_symbol)

#get 1st 50
sig2_norm_symbol <- sig2_norm_symbol[1:70,]

#change symbol to be rownames instead of ensembl ids
row.names(sig2_norm_symbol) <- sig2_norm_symbol$symbol

#remove first and last column 
sig2_norm_symbol <- sig2_norm_symbol[,c(-1,-16)]
# Run pheatmap
pheatmap(sig2_norm_symbol[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")

pheatmap(sig2_norm_symbol[1:30,],
         #color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),cluster_cols=T,               fontsize_row = 7,
         scale ="row")

# Run pheatmap
pheatmap(sig2_norm_symbol[1:30,3:10],
         #color = heat_colors, 
         cluster_rows =F,
         cluster_cols = F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         fontsize_row = 7,
         scale ="row")




Vertical growth phase (VGP) Vs Metastasis “MET”

Now, we will move to the 3nd comparison and perform the analysis again. we will try to find the significant genes during the development from the Vertical growth phase to the Metastasis.

head(counts_data)
ABCDEFGHIJ0123456789
 
 
NHEM76
<int>
NHEM77
<int>
Sbcl2_64
<int>
Sbcl2_70
<int>
WM1366_66
<int>
WM1366_72
<int>
WM3211_65
<int>
WM3211_71
<int>
WM793_67
<int>
ENSG00000223972202135101
ENSG0000022723279109510744
ENSG00000241860022225112
ENSG0000027945749191322265
ENSG00000228463112013105726
ENSG0000023709432557256111



resultsNames(dds)
[1] "Intercept"                "con_metastasis_vs_normal"
[3] "con_radial_vs_normal"     "con_vertical_vs_normal"  

set factor level

dds$con <- relevel(dds$con, ref = "vertical")

DE analysis

Fitting the raw counts to the DESeq2 model.
#run the analysis again
dds3 <- DESeq(dds)
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
resultsNames(dds3)
[1] "Intercept"                  "con_radial_vs_vertical"    
[3] "con_normal_vs_vertical"     "con_metastasis_vs_vertical"



Exploring the results

plotDispEsts(dds3)



Extraction of results

dds3_res <- results(dds3, contrast = c("con", "metastasis", "vertical"))
dds3_res
log2 fold change (MLE): con metastasis vs vertical 
Wald test p-value: con metastasis vs vertical 
DataFrame with 26574 rows and 6 columns
                  baseMean log2FoldChange     lfcSE       stat
                 <numeric>      <numeric> <numeric>  <numeric>
ENSG00000223972    1.78799     -0.0755989  1.217621 -0.0620874
ENSG00000227232    7.01446      0.2697658  0.506884  0.5322040
ENSG00000241860    1.98683      0.3424275  0.916919  0.3734545
ENSG00000279457    5.70124      0.5223273  0.608146  0.8588846
ENSG00000228463    5.49010     -1.0887743  0.840198 -1.2958546
...                    ...            ...       ...        ...
ENSG00000198695  399.68416       1.376354  0.309246    4.45068
ENSG00000210194    2.94970       1.593152  1.220674    1.30514
ENSG00000198727 1411.29417       0.928511  0.259295    3.58091
ENSG00000210195    9.21425       1.086471  0.596912    1.82015
ENSG00000210196   58.66427       1.129573  0.318242    3.54941
                     pvalue        padj
                  <numeric>   <numeric>
ENSG00000223972    0.950493    0.979303
ENSG00000227232    0.594585    0.785553
ENSG00000241860    0.708810    0.858696
ENSG00000279457    0.390404    0.624851
ENSG00000228463    0.195026    0.413021
...                     ...         ...
ENSG00000198695 8.55982e-06 0.000182397
ENSG00000210194 1.91845e-01 0.408995282
ENSG00000198727 3.42404e-04 0.003847627
ENSG00000210195 6.87358e-02 0.211005978
ENSG00000210196 3.86092e-04 0.004182180

log2 fold change : condition MET vs VGP, indicating that VGP is the base level of comparison.



The MA plot

plotMA(dds3, ylim=c(-20,20))

The genes that are significantly DE are colored blue.



LFC shrinkage

dds3_LFC <- lfcShrink(dds3,
                      coef = "con_metastasis_vs_vertical" ,
                      res = dds3_res)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(dds3_LFC, ylim=c(-20,20))

Now we see more restricted log2fold change values, especially for lowly expressed genes.



DESeq2 results table

Getting description for the coloumns in the results table

mcols(dds3_LFC)
DataFrame with 5 rows and 2 columns
                       type            description
                <character>            <character>
baseMean       intermediate mean of normalized c..
log2FoldChange      results log2 fold change (MA..
lfcSE               results posterior SD: con me..
pvalue              results Wald test p-value: c..
padj                results   BH adjusted p-values



Determine siginificant DE genes,

head(dds3_LFC,n =10 )
log2 fold change (MAP): con metastasis vs vertical 
Wald test p-value: con metastasis vs vertical 
DataFrame with 10 rows and 5 columns
                 baseMean log2FoldChange     lfcSE      pvalue
                <numeric>      <numeric> <numeric>   <numeric>
ENSG00000223972   1.78799     -0.0114374  0.504345 9.50493e-01
ENSG00000227232   7.01446      0.1474614  0.383395 5.94585e-01
ENSG00000241860   1.98683      0.0959220  0.476030 7.08810e-01
ENSG00000279457   5.70124      0.2493869  0.439049 3.90404e-01
ENSG00000228463   5.49010     -0.3594793  0.570045 1.95026e-01
ENSG00000237094   8.96601      0.2806600  0.451384 3.40359e-01
ENSG00000225972   5.58042      0.8239505  0.656229 3.94584e-02
ENSG00000225630  38.66330      1.7205444  0.516495 8.36129e-05
ENSG00000237973 141.47226      0.4001346  0.393890 1.77995e-01
ENSG00000229344  94.37507     -1.4352689  0.683092 2.22839e-03
                      padj
                 <numeric>
ENSG00000223972 0.97930298
ENSG00000227232 0.78555316
ENSG00000241860 0.85869610
ENSG00000279457 0.62485098
ENSG00000228463 0.41302143
ENSG00000237094 0.57723834
ENSG00000225972 0.14324477
ENSG00000225630 0.00123233
ENSG00000237973 0.39120854
ENSG00000229344 0.01687426



Siginificant DE genes summary

summary(dds3_LFC)

out of 26574 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 2822, 11%
LFC < 0 (down)     : 2283, 8.6%
outliers [1]       : 7, 0.026%
low counts [2]     : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Over 5000 genes are DE.
<br

summary(dds3_LFC, alpha = 0.05)

out of 26574 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 2193, 8.3%
LFC < 0 (down)     : 1759, 6.6%
outliers [1]       : 7, 0.026%
low counts [2]     : 5152, 19%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



Testing for siginifcant genes using both an alpha value threshold and a log2fold change threshold different from 0
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale 
dds3_res <- results(dds3, 
                    contrast = c("con", "metastasis", "vertical"),
                    alpha = 0.05,
                 #   lfcThreshold = 0.32
                 ) 
#Reshrink the foldchanges 
dds3_LFC <- lfcShrink(dds3,
                      coef = "con_metastasis_vs_vertical" ,
                      res = dds3_res)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895



Annotaion of the genes

# Turn results table into a data frame 
dds3_res_df <- data.frame(dds3_LFC)
head (dds3_res_df)
ABCDEFGHIJ0123456789
 
 
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
ENSG000002239721.787989-0.011437400.50434500.9504932NA
ENSG000002272327.0144580.147461420.38339460.59458470.7780712
ENSG000002418601.9868260.095921990.47602990.7088102NA
ENSG000002794575.7012380.249386890.43904870.39040420.6150873
ENSG000002284635.490095-0.359479340.57004470.19502560.4031438
ENSG000002370948.9660150.280659980.45138360.34035900.5670516

dds3_res_anno <- rownames_to_column(dds3_res_df, var = "ensgene") 
dds3_res_anno <- left_join(x = dds3_res_anno,
                 y = grch38[, c("ensgene", "symbol", "description")],
                  by = c("ensgene"))
View(dds3_res_anno)



Extraction of the siginficant DE genes

dds3_sig <- subset(dds3_res_anno, padj < 0.05)



Ordering the genes by padjusted values

dds3_sig <- dds3_sig %>% arrange(padj)



Exploring the final table

#View(dds3_sig)
dds3_first20 <- dds3_sig[1:20,]

write.csv(dds3_first20, "dds3_first20.csv", row.names = F)



Visualization of the results

Expression heatmap

# Subset normalized counts to significant genes 
sig_norm_counts3 <- norm_counts[dds3_sig$ensgene,]

# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()

# Run pheatmap
pheatmap(sig_norm_counts3,
         color = heat_colors, 
         cluster_rows =T,
         show_rownames =F,
         annotation = dplyr::select(colData, con),
         scale ="row")

##error NA/NAN vlaues



Removing Nas and runnig pheatmap again

sig_norm_counts2[sig_norm_counts2==0] <- NA
 
# Delete the rows associated with NA.

sig_norm_counts2<-sig_norm_counts2[complete.cases(sig_norm_counts2),]
# Run pheatmap
#pheatmap(sig_norm_counts2,
      #   color = heat_colors, 
       #  cluster_rows =T,
        # show_rownames =F,
         #annotation = select(colData2, con2),
         #scale ="row")



Visualizing results-Expression plot

top3_20 <- data.frame(sig_norm_counts3)[1:20,]  %>% rownames_to_column(var ="ensgene")
top3_20 <- gather(top3_20, key ="samplename", value ="normalized_counts",9:14)
top3_20 <- inner_join(top3_20, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
ggplot(top3_20)+ 
  geom_point(aes(x = ensgene, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))
Warning: Transformation introduced infinite values in continuous y-axis


## Subset normalized counts to significant genes 
sig3_norm_symbol <- norm_counts[dds3_sig$ensgene,]

#apply this to norm counts
sig3_norm_symbol <- rownames_to_column(as.data.frame(sig3_norm_symbol), var = "ensgene")

#pick symbol from gr38
sig3_norm_symbol <- left_join(x =  sig3_norm_symbol,
                 y = grch38[ , c("ensgene", "symbol")],
                  by = c("ensgene"))

#remove duplicates
sig3_norm_symbol <- unique(sig3_norm_symbol)
head(sig3_norm_symbol)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
1ENSG000001245753493.500453077.35118293734.3491253652.45578094.7584883.1498262
2ENSG000002065383.425840.578449536.39040939.12156371134.1063791306.3904114
3ENSG00000164692287.77057241.21342922.5993152.49712111.5861630.7874565
4ENSG000001481541949.303021786.83041431711.2156662150.02125855538.8803885631.1017674
5ENSG000001240920.856462.89224740.0000000.83237370.0000001.5749131
6ENSG000001119078.5646012.1474389436.684909485.273865231.72325533.8606315
NA
top3_20_symbol <- data.frame(sig3_norm_symbol)[1:20,]  %>% rownames_to_column(var ="ids")

#remove ids column
top3_20_symbol <- top3_20_symbol[,-1]

#key column will be a seperate column
top3_20_symbol <- gather(top3_20_symbol, key ="samplename", value ="normalized_counts",10:14)

top3_20_symbol <- inner_join(top3_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
ggplot(top3_20_symbol)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))
Warning: Transformation introduced infinite values in continuous y-axis

Heatmap

#remove duplicates
sig3_norm_symbol <- unique(sig3_norm_symbol)

#get 1st 50
sig3_norm_symbol <- sig3_norm_symbol[1:70,]

#change symbol to be rownames instead of ensembl ids
row.names(sig3_norm_symbol) <- sig3_norm_symbol$symbol

#remove first and last column 
sig3_norm_symbol <- sig3_norm_symbol[,c(-1,-16)]
# Run pheatmap
pheatmap(sig3_norm_symbol[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")

pheatmap(sig3_norm_symbol[1:30,],
         #color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),cluster_cols=T,               fontsize_row = 7,
         scale ="row")

# Run pheatmap
pheatmap(sig3_norm_symbol[1:30,5:14],
         #color = heat_colors, 
         cluster_rows =F,
         cluster_cols = F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         fontsize_row = 7,
         scale ="row")



Find Specific genes in every phase-comparison

Subset unmatched rows from data frame

Now, we have three tables of DE genes. * The first one represents the DE genes from normal to RGP. * The second one represents the DE genes from RGP to VGP. * The third one represents the DE genes from VGP to MET.

#normal vs RGP
head(dds1_sig) 
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000136040736.3605-6.5486980.25563986.743132e-1461.444042e-141PLXNC1
2ENSG000001233741324.7676-3.0720220.13863488.838296e-1109.463605e-106CDK2
3ENSG00000008256582.9415-3.5221280.21522313.488942e-612.490523e-57CYTH3
4ENSG00000119042371.05645.2380480.32713514.032854e-592.159089e-55SATB2
5ENSG00000138771253.57897.5004620.47018081.917069e-578.210805e-54SHROOM3
6ENSG000001350471987.7506-4.7931000.32846172.174088e-497.759684e-46CTSL
#RGP vs VGP
head(dds2_sig) 
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000164741938.537144.3299800.22787395.834329e-821.189270e-77DLC1
2ENSG00000008256582.941523.1124430.18537741.561472e-641.591452e-60CYTH3
3ENSG000001245751932.32137-10.0116410.67413295.887644e-504.000458e-46H1-3
4ENSG00000138771253.57894-4.1161670.28191256.602015e-493.364387e-45SHROOM3
5ENSG0000016257673.99471-6.8813970.49409429.545988e-453.891708e-41MXRA8
6ENSG00000206538447.019364.6795960.34111582.540234e-448.630023e-41VGLL3
#VGP vs MET
head(dds3_sig) 
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG000001245751932.32149.8584600.57403761.178796e-662.402857e-62H1-3
2ENSG00000206538447.0194-4.2685330.26318441.909101e-601.945756e-56VGLL3
3ENSG00000164692433.237310.4952470.71215753.038157e-482.064327e-44COL1A2
4ENSG000001481542917.7125-3.1845330.22794139.542799e-464.863010e-42UGCG
5ENSG00000124092168.24229.2752500.65870179.930855e-444.048611e-40CTCFL
6ENSG00000111907179.59583.3278130.24582267.801240e-432.650341e-39TPD52L1





Normal vs RGP

We will use the anti_join() from dplyr package to filter out the specific genes which is related only to the development of cells from normal to RGP. The idea is to subtract from both phases (“RGP vs VGP” table and “VGP vs MET”)

#subtract from "RGP vs VGP" table (dds2_sig) 
dds1_1 <-anti_join(dds1_sig, dds2_sig, by= c("ensgene", "symbol"))
head(dds1_1)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000136040736.3605-6.5486980.25563986.743132e-1461.444042e-141PLXNC1
2ENSG00000119042371.05645.2380480.32713514.032854e-592.159089e-55SATB2
3ENSG00000162409321.86387.3707580.61597681.693040e-332.014248e-30PRKAA2
4ENSG00000142949900.41884.1560740.35247922.516308e-332.836144e-30PTPRF
5ENSG00000142627719.86985.3238070.45998552.713355e-322.641204e-29EPHA2
6ENSG000000393191890.7175-3.1046630.27372398.162937e-316.723434e-28ZFYVE16



now we will subtract the new dataframe from the last phase only

#subtract from "VGP vs MET" table (dds3_sig) 
dds1_1 <-anti_join(dds1_1, dds3_sig, by= c("ensgene", "symbol"))
head(dds1_1)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000162409321.86387.3707580.61597681.693040e-332.014248e-30PRKAA2
2ENSG000000393191890.7175-3.1046630.27372398.162937e-316.723434e-28ZFYVE16
3ENSG00000161544144.1902-6.4920690.57256477.887861e-316.723434e-28CYGB
4ENSG00000104044273.8170-10.3571390.95571274.524479e-273.125540e-24OCA2
5ENSG00000130775101.3481-4.2488850.41263705.927725e-263.733595e-23THEMIS2
6ENSG00000105048365.44645.0736440.49863631.686641e-251.031983e-22TNNT1

So now we only have genes that are specific to Normal vs RGP and are not shared with other developmental phases.

#save the DE genes in a table

write.table(dds1_1,file = "DE_norm_RGP.txt", sep = "\t", row.names = FALSE)



for DAVID analysis

genes1_1 <- as.factor(dds1_1$symbol)
write.table(genes1_1, "genes1_1.txt")
export(genes1_1, "genes1_1.xlsx")

RGP vs VGP

The idea is to subtract from both phases (“normal vs RGP” and “VGP vs MET”)

#subtract from "normal vs RGP"
dds2_2 <-anti_join(dds2_sig, dds1_sig, by= c("ensgene", "symbol"))
View(dds2_2)
#subtract from "VGP vs MET"
dds2_2 <-anti_join(dds2_2, dds3_sig, by= c("ensgene", "symbol"))
View(dds2_2)

reduced from 897 Genes to 602.

#save the DE genes in a table

write.csv(dds2_2,file = "DE_RGP_VGP.csv", row.names = FALSE)



VGP vs MET

We will use the same approach

#subtract from "normal vs RGP(dds1_sig)"
dds3_3 <-anti_join(dds3_sig, dds1_sig, by= c("ensgene", "symbol"))
View(dds3_3)
#subtract from "RGP vs VGP"
dds3_3 <-anti_join(dds3_3, dds2_sig, by= c("ensgene", "symbol"))
View(dds3_3)

Genes are reduced from 1208 to 913 genes, which are specific only for this phase from vertical growth phase to Metastasis.

#save the DE genes in a table
write.table(dds3_3, file = "DE_VGP_MET.txt", sep = " ", col.names = NA)

GO? GSEQ?

Drawing a venndiagram

            venn.diagram(list("list C"=C, "list D"=D), fill = c("yellow","cyan"), cex = 1.5, filename="Lesson-06/Venn_diagram_genes_2.png")

            venn.diagram(list(A = A, C = C, D = D), fill = c("yellow","red","cyan"), cex = 1.5,filename="Lesson-06/Venn_diagram_genes_3.png")

            venn.diagram(list(A = A, B = B, C = C, D = D), fill = c("yellow","red","cyan","forestgreen"), cex = 1.5,filename="Lesson-06/Venn_diagram_genes_4.png")
#install.packages("VennDiagram")
library(VennDiagram)
Loading required package: grid
Loading required package: futile.logger
#venn.diagram(list(A = dds1_omit, C = dds2_omit, D = dds3_omit), fill = c("yellow","red","cyan"), cex = 1.5,filename="Venn_diagram_genes_3.png")
which(is.na(dds1_sig))
 [1] 30517 30892 31433 31472 31494 31935 32022 32480 32550 32717 32745
[12] 35252 35627 36168 36207 36229 36670 36757 37215 37285 37452 37480
sum(is.na(dds1_sig))
[1] 22

delete na values

dds1_omit <- na.omit(dds1_sig)
sum(is.na(dds1_omit))
[1] 0
dds2_omit <- na.omit(dds2_sig)
dds3_omit <- na.omit(dds3_sig)
grid.newpage()                                       
draw.triple.venn(area1 = 10,                         
                 area2 = 10,
                 area3 = 10,
                 n12 = 2,
                 n23 = 2,
                 n13 = 2,
                 n123 = 1,
                 fill = c("pink", "green", "orange"),
                 lty = "blank",
              category = c("normal_Vs_RGP", "RGP_Vs_VGP", "VGP_Vs_MET"))
(polygon[GRID.polygon.1081], polygon[GRID.polygon.1082], polygon[GRID.polygon.1083], polygon[GRID.polygon.1084], polygon[GRID.polygon.1085], polygon[GRID.polygon.1086], text[GRID.text.1087], text[GRID.text.1088], text[GRID.text.1089], text[GRID.text.1090], text[GRID.text.1091], text[GRID.text.1092], text[GRID.text.1093], text[GRID.text.1094], text[GRID.text.1095], text[GRID.text.1096]) 

head(dds1_omit)
ABCDEFGHIJ0123456789
 
 
ensgene
<chr>
baseMean
<dbl>
log2FoldChange
<dbl>
lfcSE
<dbl>
pvalue
<dbl>
padj
<dbl>
symbol
<chr>
1ENSG00000136040736.3605-6.5486980.25563986.743132e-1461.444042e-141PLXNC1
2ENSG000001233741324.7676-3.0720220.13863488.838296e-1109.463605e-106CDK2
3ENSG00000008256582.9415-3.5221280.21522313.488942e-612.490523e-57CYTH3
4ENSG00000119042371.05645.2380480.32713514.032854e-592.159089e-55SATB2
5ENSG00000138771253.57897.5004620.47018081.917069e-578.210805e-54SHROOM3
6ENSG000001350471987.7506-4.7931000.32846172.174088e-497.759684e-46CTSL
dds1_venn <- dds1_omit$ensgene
class(dds1_venn)
[1] "character"
dds2_venn <- dds2_omit$ensgene
class(dds2_venn)
[1] "character"
dds3_venn <- dds3_omit$ensgene
 
venn.diagram(list("normal vs RGP" = dds1_venn, "RGP vs VGP" = dds2_venn, "VGP vs MET" = dds3_venn), fill = c("yellow","red","cyan"), cex = 1.5,filename="Venn_diagram_1.png")
[1] 1

Actually plot the plot

grid.draw(venn.plot)

venn.plot <- draw.pairwise.venn(length(dds1_venn),
                                length(dds2_venn),
                                # Calculate the intersection of the two sets
                                length( intersect(dds1_venn, dds2_venn) ),
                                category = c("Normal vs RGP", "RGP vs VGP"), scaled = F,
                                fill = c("light blue", "pink"), alpha = rep(0.5, 2),
                                cat.pos = c(0, 0)) ;
grid.draw(venn.plot);
grid.newpage();

library(gplots)

Attaching package: ‘gplots’

The following object is masked from ‘package:IRanges’:

    space

The following object is masked from ‘package:S4Vectors’:

    space

The following object is masked from ‘package:stats’:

    lowess
# Create a Venn-diagram given just the list of gene-names for both sets
#venn(list("IPSC Trisomic vs Disomic" = ipsc.degs,
#          "NEURON Trisomic vs Disomic" = neur.degs), )
library(gplots)
venn(list("noramal vs RGP" = dds1_venn,
          "RGP vs VGP" = dds2_venn, 
     "VGP vs MET"= dds3_venn))

venn(list("RGP vs VGP" = dds2_venn, 
     "VGP vs MET"= dds3_venn), )

Functional annotation

Now, we will use clusterProfiler to perform: * Over representation analysis * Gene Set Enrichment Analysis GSEA

library(org.Hs.eg.db)
Loading required package: AnnotationDbi

Attaching package: ‘AnnotationDbi’

The following object is masked from ‘package:dplyr’:

    select
library(DOSE)
DOSE v3.20.1  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
library(pathview)
##############################################################################
Pathview is an open source software package distributed under GNU General
Public License version 3 (GPLv3). Details of GPLv3 is available at
http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
formally cite the original Pathview paper (not just mention it) in publications
or products. For details, do citation("pathview") within R.

The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
license agreement (details at http://www.kegg.jp/kegg/legal.html).
##############################################################################
library(clusterProfiler)
clusterProfiler v4.2.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

Attaching package: ‘clusterProfiler’

The following object is masked from ‘package:AnnotationDbi’:

    select

The following object is masked from ‘package:purrr’:

    simplify

The following object is masked from ‘package:IRanges’:

    slice

The following object is masked from ‘package:S4Vectors’:

    rename

The following object is masked from ‘package:stats’:

    filter
library(AnnotationHub)
Loading required package: BiocFileCache
Loading required package: dbplyr

Attaching package: ‘dbplyr’

The following objects are masked from ‘package:dplyr’:

    ident, sql


Attaching package: ‘AnnotationHub’

The following object is masked from ‘package:Biobase’:

    cache
library(ensembldb)
Loading required package: GenomicFeatures
Loading required package: AnnotationFilter

Attaching package: 'ensembldb'

The following object is masked from 'package:clusterProfiler':

    filter

The following object is masked from 'package:dplyr':

    filter

The following object is masked from 'package:openxlsx':

    addFilter

The following object is masked from 'package:stats':

    filter
library(enrichplot)
library(ggnewscale)
library(tidyverse)

To perform the over-representation analysis, we need a list of background genes and a list of significant genes. For our background dataset we will use all genes tested for differential expression (all genes in our results table). For our significant gene list we will use genes with p-adjusted values less than 0.05 (we could include a fold change threshold too if we have many DE genes).

## Create background dataset for hypergeometric testing using all genes tested for significance in the results                 
allOE_genes <- as.character(dds1_res_anno$ensgene)

## Extract significant results
sigOE <- dplyr::filter(dds1_res_anno, padj < 0.05)

sigOE_genes <- as.character(sigOE$ensgene) #genes vector

Now we can perform the GO enrichment analysis and save the results:

## Run GO enrichment analysis 
ego <- enrichGO(gene = sigOE_genes, 
                universe = allOE_genes,
                keyType = "ENSEMBL",
                OrgDb = org.Hs.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                qvalueCutoff = 0.05, 
                readable = TRUE)
                
## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)

write.csv(cluster_summary, "GO1_NorRad.csv")
Visualizing clusterProfiler results

clusterProfiler has a variety of options for viewing the over-represented GO terms. We will explore the dotplot, enrichment plot, and the category netplot.

The dotplot shows the number of genes associated with the first 50 terms (size) and the p-adjusted values for these terms (color). This plot displays the top 50 genes by gene ratio (# genes related to GO term / total number of sig genes), not p-adjusted value.

Dotplot

dotplot(ego, showCategory=25, title = "GO Biological process enrichment",label_format = 55)

barplot(ego, showCategory = 15)

head(ego2)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
Description
<chr>
GeneRatio
<chr>
BgRatio
<chr>
GO:0030335GO:0030335positive regulation of cell migration168/3609464/14366
GO:0040017GO:0040017positive regulation of locomotion174/3609491/14366
GO:2000147GO:2000147positive regulation of cell motility171/3609481/14366
GO:0051272GO:0051272positive regulation of cellular component movement174/3609492/14366
GO:0042330GO:0042330taxis166/3609467/14366
GO:0001667GO:0001667ameboidal-type cell migration138/3609375/14366

The next plot is the enrichment GO plot, which shows the relationship between the top 50 most significantly enriched GO terms (padj.), by grouping similar terms together. The color represents the p-values relative to the other displayed terms (brighter red is more significant) and the size of the terms represents the number of genes that are significant from our list.

a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional module.

## Enrichmap clusters the 50 most significant (by padj) GO terms to visualize relationships between terms
#emapplot(ego, showCategory = 10)

##Error in has_pairsim(x)  Please use pairwise_termsim function to deal with the results of enrichment analysis.
x2 <- pairwise_termsim(ego)
emapplot(x2, showCategory = 10, label_format= 40, cex_category=.8)

Finally, the category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). The size of the GO terms reflects the pvalues of the terms, with the more significant terms being larger. This plot is particularly useful for hypothesis generation in identifying genes that may be important to several of the most affected processes.

To color genes by log2 fold changes, we need to extract the log2 fold changes from our results table creating a named vector

OE_foldchanges <- sigOE$log2FoldChange

names(OE_foldchanges) <- sigOE$symbol
## Cnetplot details the genes associated with one or more terms - by default gives the top 5 significant terms (by padj)
cnetplot(ego, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=3,
         ggrepel.max.overlaps = Inf,
          )
Warning: ggrepel: 266 unlabeled data points (too many overlaps). Consider increasing max.overlaps

ego2 <- data.frame(ego)
View(ego2)

If you are interested in significant processes that are not among the top five, you can subset your ego dataset to only display these processes:

# find index to subset
which(ego2$ID == "GO:0042438") #melanin biosynthetic process
integer(0)
which(ego2$ID == "GO:0048021") #regulation of melanin biosynthetic 
integer(0)
which(ego2$ID == "GO:0010631") #epithelial cell migration
[1] 15
which(ego2$ID == "GO:0043473") #Pigmentation
[1] 62
which(ego2$ID == "GO:0050673") #epithelial cell proliferation
[1] 34
which(ego2$ID == "GO:0043588") #skin development
[1] 33
## Subsetting the ego results without overwriting original `ego` variable
ego3 <- ego

ego3@result <- ego@result[c(140
,611,14,71,45),]

## Plotting terms of interest
cnetplot(ego3, 
         categorySize="pvalue", 
         foldChange=OE_foldchanges, 
         showCategory = 5, 
         vertex.label.font=6)
Warning: ggrepel: 215 unlabeled data points (too many overlaps). Consider increasing max.overlaps

head(ego3)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
Description
<chr>
GO:0003007GO:0003007heart morphogenesis
GO:0006470GO:0006470protein dephosphorylation
GO:0048284GO:0048284organelle fusion
GO:0090050GO:0090050positive regulation of cell migration involved in sprouting angiogenesis
## convert gene ID to Symbol
edox <- setReadable(ego, 'org.Hs.eg.db', 'ENTREZID')
p1 <- cnetplot(edox, foldChange=OE_foldchanges)

## categorySize can be scaled by 'pvalue' or 'geneNum'
p2 <- cnetplot(edox, categorySize="pvalue", foldChange=OE_foldchanges)
p3 <- cnetplot(edox,showCategory = 3, foldChange=OE_foldchanges, circular = TRUE, colorEdge = TRUE) 
cowplot::plot_grid(p1, p2, p3, ncol=3, labels=LETTERS[1:3], rel_widths=c(.8, .8, 1.2))
Warning: ggrepel: 274 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Warning: ggrepel: 274 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Warning: ggrepel: 173 unlabeled data points (too many overlaps). Consider increasing max.overlaps

cowplot::plot_grid( p3, rel_widths=c(.8, .8, 1.2))
Warning: ggrepel: 138 unlabeled data points (too many overlaps). Consider increasing max.overlaps

Gene set Enrichement analysis using GSEA Java-based

preparing normalized counts for GSEA

#convert norm_counts to df
norm_counts_df <- data.frame(norm_counts)
norm_counts_GSEA <- norm_counts_df %>% mutate(desc= NA, .before="NHEM76")
head(norm_counts_GSEA)
ABCDEFGHIJ0123456789
 
 
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
WM3211_65
<dbl>
ENSG00000223972NA1.712920.0000001.73287660.83237374.7584883.9372831.351221
ENSG00000227232NA5.995225.2060458.66438317.49136337.9308147.8745659.458546
ENSG00000241860NA0.000001.1568991.73287661.66474743.1723263.9372831.351221
ENSG00000279457NA3.425845.20604516.462327910.82085813.1723261.5749132.702442
ENSG00000228463NA9.4210611.5689890.86643832.497121115.8616283.9372839.458546
ENSG00000237094NA2.569381.1568994.33219164.161868511.10313919.6864148.107325
# give the genes coloumn a proper name
norm_counts_GSEA <- cbind(ID  = rownames(norm_counts_GSEA), norm_counts_GSEA)
head(norm_counts_GSEA)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
ENSG00000223972ENSG00000223972NA1.712920.0000001.73287660.83237374.7584883.937283
ENSG00000227232ENSG00000227232NA5.995225.2060458.66438317.49136337.9308147.874565
ENSG00000241860ENSG00000241860NA0.000001.1568991.73287661.66474743.1723263.937283
ENSG00000279457ENSG00000279457NA3.425845.20604516.462327910.82085813.1723261.574913
ENSG00000228463ENSG00000228463NA9.4210611.5689890.86643832.497121115.8616283.937283
ENSG00000237094ENSG00000237094NA2.569381.1568994.33219164.161868511.10313919.686414
# remove original rownames
#rownames(norm_counts_GSEA ) <- NULL
head(norm_counts_GSEA)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
WM1366_66
<dbl>
WM1366_72
<dbl>
ENSG00000223972ENSG00000223972NA1.712920.0000001.73287660.83237374.7584883.937283
ENSG00000227232ENSG00000227232NA5.995225.2060458.66438317.49136337.9308147.874565
ENSG00000241860ENSG00000241860NA0.000001.1568991.73287661.66474743.1723263.937283
ENSG00000279457ENSG00000279457NA3.425845.20604516.462327910.82085813.1723261.574913
ENSG00000228463ENSG00000228463NA9.4210611.5689890.86643832.497121115.8616283.937283
ENSG00000237094ENSG00000237094NA2.569381.1568994.33219164.161868511.10313919.686414
#save the normalized counts in a table ready for GSEA Java based program
write.table(norm_counts_GSEA, file = "GSEA/Normal2_GSEA.txt", sep = "\t", row.names = F)
#write.table(norm_counts, file = "GSEA/norm_counts.tsv", sep= "\t", col.names = NA)
export(norm_counts, rowNames = T, "GSEA/norm_counts.xlsx")

preparing normalized counts for GSEA

#convert norm_counts to df
norm_counts_df <- data.frame(norm_counts)
normal_Radial_GSEA <- norm_counts_df %>% dplyr::select(1:4)
head(normal_Radial_GSEA)
ABCDEFGHIJ0123456789
 
 
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
ENSG000002239721.712920.0000001.73287660.8323737
ENSG000002272325.995225.2060458.66438317.4913633
ENSG000002418600.000001.1568991.73287661.6647474
ENSG000002794573.425845.20604516.462327910.8208581
ENSG000002284639.4210611.5689890.86643832.4971211
ENSG000002370942.569381.1568994.33219164.1618685
normal_Radial_GSEA <- normal_Radial_GSEA %>% mutate(desc= NA, .before="NHEM76")
head(normal_Radial_GSEA)
ABCDEFGHIJ0123456789
 
 
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
ENSG00000223972NA1.712920.0000001.73287660.8323737
ENSG00000227232NA5.995225.2060458.66438317.4913633
ENSG00000241860NA0.000001.1568991.73287661.6647474
ENSG00000279457NA3.425845.20604516.462327910.8208581
ENSG00000228463NA9.4210611.5689890.86643832.4971211
ENSG00000237094NA2.569381.1568994.33219164.1618685
# give the genes coloumn a proper name
normal_Radial_GSEA <- cbind(ID  = rownames(normal_Radial_GSEA), normal_Radial_GSEA)
head(normal_Radial_GSEA)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
ENSG00000223972ENSG00000223972NA1.712920.0000001.73287660.8323737
ENSG00000227232ENSG00000227232NA5.995225.2060458.66438317.4913633
ENSG00000241860ENSG00000241860NA0.000001.1568991.73287661.6647474
ENSG00000279457ENSG00000279457NA3.425845.20604516.462327910.8208581
ENSG00000228463ENSG00000228463NA9.4210611.5689890.86643832.4971211
ENSG00000237094ENSG00000237094NA2.569381.1568994.33219164.1618685
# remove original rownames
rownames(normal_Radial_GSEA ) <- NULL
head(normal_Radial_GSEA)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
1ENSG00000223972NA1.712920.0000001.73287660.8323737
2ENSG00000227232NA5.995225.2060458.66438317.4913633
3ENSG00000241860NA0.000001.1568991.73287661.6647474
4ENSG00000279457NA3.425845.20604516.462327910.8208581
5ENSG00000228463NA9.4210611.5689890.86643832.4971211
6ENSG00000237094NA2.569381.1568994.33219164.1618685
#save the normalized NormalVSRadial counts in a table
write.table(normal_Radial_GSEA, file = "normal_Rad_GSEA.txt", sep = "\t", row.names = F)
export(normal_Radial_GSEA, "normal_Rad_GSEA.xlsx")
class(normal_Radial_GSEA)
[1] "data.frame"
head(normal_Radial_GSEA)
ABCDEFGHIJ0123456789
 
 
ID
<chr>
desc
<lgl>
NHEM76
<dbl>
NHEM77
<dbl>
Sbcl2_64
<dbl>
Sbcl2_70
<dbl>
1ENSG00000223972NA1.712920.0000001.73287660.8323737
2ENSG00000227232NA5.995225.2060458.66438317.4913633
3ENSG00000241860NA0.000001.1568991.73287661.6647474
4ENSG00000279457NA3.425845.20604516.462327910.8208581
5ENSG00000228463NA9.4210611.5689890.86643832.4971211
6ENSG00000237094NA2.569381.1568994.33219164.1618685
#DAVID1 <- read.table("DAVID/Norm_Rad/chart_AAD5D588820E1657803685183.tsv", sep ="\t", header = T)
#head(DAVID1)
#barplot
#ggplot(DAVID1[1:5], aes(x=Count, y = Term))
#+geom_bar()

GSEA

Steps toward doing gene set enrichment analysis (GSEA):

1- obtaining stats for ranking genes in your experiment, 2- creating a named vector out of the DESeq2 result 3- Obtaining a gene set from mysigbd 4- doing analysis

already we performed DESeq2 analysis and have statistics for working on it

dds1_GSEA<- results(dds, 
                   contrast = c("con", "radial", "normal"),
                    alpha = 0.05)
Error in cleanContrast(object, contrast, expanded = isExpanded, listValues = listValues,  : 
  radial and normal should be levels of con such that con_radial_vs_vertical and con_normal_vs_vertical are contained in 'resultsNames(object)'

Remove Duplicates

creating a named vector [ranked genes]

VISUALIZATION

Over representation analysis using clusterProfiler and MSigDB

Retrieveing Human gene set

using Hallmark :

MSigDb over-presentaton analysis

ORA Hallmarks RGP vs VGP

ORA Hallmarks VGP vs MET

barplot(hm_ora_VGP_MET)

Multiple comparison overtime #cancelled. how the genes behave overtime #cancelled. time dependancy #cancelled . time series #cancelled we have 7 gene sets GO term list based on genes vis of diff exp in each step

#writing #introduction #material and methods
ref bioc hashes with work illustration

---
title: "Diff_expAnalysis"
author: "Mohammed El Belbesy"
output:
  pdf_document: default
  html_notebook: default
---


<br><br>

### Libraries used in the analysis

```{r}
library(DESeq2)
library(RColorBrewer)
library(Biobase)
library(pheatmap)
library(annotables)
library(openxlsx)
library(rio)
library(tidyverse)
```


<br><br>

### Importing feature counts table

```{r}
counts_data<- read.table ("featurecounts.txt", header=T, row.names = 1)
```



```{r}
#head(counts_GSEA)
```

<br><br>

### Data Manipulation 
```{r}
head(counts_data) #view the counts_data table 
```
<br><br>

#### removing the first 5 columns
```{r}
counts_data <- counts_data [ ,6:ncol(counts_data)] 
#head(counts_data)
```

<br><br>

#### Renamaming Columns
```{r}
colnames(counts_data) <- c("NHEM76","NHEM77","Sbcl2_64","Sbcl2_70","WM1158_68","WM1158_74","WM1366_66","WM1366_72","WM3211_65","WM3211_71","WM793_67","WM793_73","WM9_69","WM9_75")
```



```{r}
head(counts_data)
```
<br><br>

#### Reordering coloumns
```{r}
# Getting columns names and positions
colnames(counts_data)
```


```{r}
#Rearranging by columns positions 
counts_data <- counts_data[, c(1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 5, 6)]
```
This step is important, to be able to group them when creating the metadata needed for the DESeq2 object.


```{r}
head(counts_data)
```
<br><br>

#### Pre-Filtering

Filtering out the lowly expressed genes and checking the dimension of the data set. 
We will make it more than 10 because lowly expressed genes are kind of hard to measure for differential expression e.g 1 read or 2 reads.


```{r}
dim(counts_data) #before
counts_data <- counts_data [rowSums(counts_data) > 10, ]
dim(counts_data) #after
```
rows is reduced from 58721 to 26574    
<br> <br>

#### Getting rid off Enseml IDs version

* For genes (i.e. Ensembl identifiers of the form ENSG*), the version number increments when the set of transcripts linked to a gene changes. 

* The "dot digit" representing distinct version numbers associated with stable Ensembl identifiers must be removed from the Ensembl gene id.[Ensembl Documentation stable IDS](<http://asia.ensembl.org/info/genome/stable_ids/index.html>).

* This is important to be able to match them with grcm38 dataset in "annotables" package.

```{r}
tmp=gsub("\\..*","",row.names(counts_data))
```



```{r}
rownames(counts_data) <- tmp
head(counts_data)
```
<br><br>

### DESeq2 analysis Workflow 

* To perform any analysis with DESeq2, we need to create a DESeq2 object by providing the raw counts, metadata, and design formula. 
* DESeq2 object  is a list-like object.

<br><br>

#### Creating a metadata frame
```{r}
group<- factor(c("NHEM","NHEM","Sbcl2","Sbcl2","WM1366","WM1366","WM3211","WM3211","WM793","WM793","WM9","WM9","WM1158","WM1158"))

con<- factor(c(rep("normal",2), rep("radial",2), rep("vertical",6), rep("metastasis",4)))
```
<br> 


colData variable will be our metadata 
```{r}
colData <- data.frame(row.names=colnames(counts_data), group, con)
colData
```


```{r}
#Checking whether the row names in colData matches to column names in counts_data
all(colnames(counts_data) %in% rownames(colData))
```

```{r}
# Ensure that the sample names are in the same order in both datasets,
all(colnames(counts_data) == rownames(colData))
```

If not, we can modify this with match() function.

<br> <br>

#### Colors for plots

```{r}
mycols <- brewer.pal(11, "Set3")[1:length(unique(group))]
```

<br><br>

### Create DESeqDataSet Object 
* As input, the DESeq2 package expects count data as obtained, e.g., from RNA-seq in the form of a matrix of integer values which should be un-normalized counts.
 

* Because the DESeq2 model corrects for library size internally, converted or normalized numbers should not be given as input, such as counts scaled by library size.

```{r}
dds<- DESeqDataSetFromMatrix (countData= counts_data, colData=colData, design= ~ con)
```


```{r}
head(counts(dds))
```

<br><br>

### Checking the design of your DESEeqDataSets

```{r}
design(dds)
```

* We now have a DESeq2 object (dds) containing our raw counts and metadata, and we want to do quality control on our samples. As a result, the normalized counts must be generated (normalized for library size, which is the total number of gene counts per sample, while accounting for library composition).

* To evaluate the sample-level quality control matrix, the first step in the process is to normalize the raw counts.

what is meant by normalized raw counts ? 
<br>

* The raw counts represent the number of reads aligning to each gene and should be proportional to the RNA expression in the sample; however, there are additional factors that can influence the number of genes aligning to each gene besides RNA expression.

* Using normalization methods, we may alter the count data to remove the influence of these factors and remove the overall counts.
<br>

**Library depth**, **gene length**, and **RNA composition** are three of the most important parameters to consider when normalizing count data.

* If the library sizes of two samples differ, many more reads may be mapped to genes in one sample than in the other. 

In terms of **gene length**, a longer gene results in a longer transcript, which results in more fragments for sequencing. As a result, a longer gene with the same amount of expression will have more counts than a shorter gene. Therefore, when comparing the expression levels of various genes, the lengths of the genes must be taken into account.

<br><br>

#### Creating the estimateSizeFactors. 

* When adjusting for **library size**, the library's composition is also crucial to consider. Many normalization methods that are not resistant to outliers can be skewed by a few highly expressed genes.

* Normalization for the majority of genes would be skewed by the highly expressed DE gene if we only divided our counts by the total number of reads.
As a result, we need to use a technique that is resistant to these outlier genes when performing a DE analysis.

* DESeq2 normalizes using the "median of ratios" method. which accounts for library size when computing raw counts and is resistant to large numbers of differentially expressed genes.

* For normalization, the raw counts of each sample will be divided by the sample-specific size factor.

<br><br><br>

#### Estimating size factor
* We will create a data table with read counts normalized by library size 

* Because we have different library sizes for the same number of genes, we need to make a ratio between them, therefore we'll multiply every position by this size factor, which we can achieve by estimating size factors.

```{r}
dds <- estimateSizeFactors(dds)
sizeFactors(dds)# to view the size factors used for normalization 
```

```{r}
colData(dds)
```


```{r}
head(counts(dds)) #un-normalized
```

<br><br>

#### Extract the normalized counts
Now we have the size factors been calculated, and added to the DESeq2 object, the normalized counts can be extracted from it using the counts() function.
```{r}
norm_counts<- counts(dds, normalized = TRUE)
head(norm_counts)
```

```{r}
sum(is.na(norm_counts))
```

#### GSEA save normalized counts for the GSEA
```{r}
gsea_file <- read_delim("ddsNormSF.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
```

```{r}
head(gsea_file)
```
```{r}
write.table(gsea_file, file = "gsea_inputfile.txt", quote = FALSE, row.names = FALSE, sep = "\t")
```


```{r}
#test12 <- read.delim("ddsNormSF2.txt")
#head(test12)
#read_delim("file.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
#write.table(df, file = "df.txt", quote = FALSE, row.names = FALSE, sep = "\t")

```



```{r}
#save the normalized counts in a table
#write.table(norm_counts, file = "C:/Users/melbe/Documents/R/R_Projects/Diff_Exp_thesis/ddsNormSF2.txt", sep = "\t", col.names = NA)

```

Now that we've normalized the counts, we can move on to the next step of our analysis. We can now compare the counts between the different samples because the counts have been normalized for library size.

<br><br> 

#### Quality assessment of our samples

* To measure the quality of our experiment, we can look at how the samples relate in terms of gene expression.

* Visualization approaches for unsupervised clustering analyses, such as hierarchical heatmaps and principal component analysis PCA, are used to do this.

* These QC approaches are used to determine how comparable the biological replicates are to one another, as well as to detect outlier samples and major sources of variation in the data set.

* To better the visualization of the clustering, we should first use the log to transform the normalized counts before using these Visualization methods. DESeq2 applies a variance stabilizing transformation (VST) to RNA-Seq data, which is a logarithmic transformation that reduces variance across the mean.

<br><br>

#### Transform the normalized counts 

The DESeq2 vst() function on the DESeq2 object can be used to transform the normalized counts.
The blind = True argument indicates that the transformation should be blind to the sample information provided in the design formula; this argument should be stated during the quality evaluation.

```{r}
vsd <- vst(dds, blind = TRUE)
```

<br><br>

#### Heatmaps

* To assess the similarity and gene expression between different samples in a dataset, hierarchical clustering with heatmaps is used. 

* This method is used to determine how similar replicates are to one another and whether samples from various sample groups cluster together or separately. The heatmap is made by combining the gene expression correlation values for all pairwise combinations of samples in the data set, with 1 being the perfect correlation.

* The heatmap's colors reflect the correlation values, while the hierarchical tree shows which samples are more similar to one another. The biological replicates should be grouped together, whereas the sample conditions should be separated. Because the majority of genes should not be differentially expressed, samples should have a high correlation.



* Samples having correlation values less than 0.8 may need to be investigated further to see if they are outliers or have contamination.

<br><br>

##### Extract the matrix of transformed counts
To make a heatmap, we'll use the assay() function to extract the VST-transformed normalized counts as a matrix from a vsd object.
The pairwise correlation values between each pair of samples can then be computed using the cor() function.

```{r}
vsd_mat <- assay(vsd) 
```

<br><br>

#### Computing the correlation values between samples
```{r}
vsd_cor <- cor(vsd_mat)
```

<br><br>
 
##### Plot the heatmap
* To create the heatmap, we can use the pheatmap package after generating the correlation values.
* The annotation arguments determine which meta data factors should be used as annotation bars. To select the condition column in colData1, we use the select () function from the dplyr package. The heat map's output shows that the biological replicates are clustered together and the condition is separated.

```{r}
pheatmap(vsd_cor, annotation = dplyr::select(colData, con))
```
The biological replicates are clustered together, while the samples from various conditions are clustered separately. There are no outliers or samples with low correlation values when compared to the rest of the data.

<br><br>

#### Pricipal component analysis PCA
* In order to continue evaluating the quality of our samples, we will use PCA to see how our samples cluster and whether our condition of interest corresponds to the principal components explaining the most variation in the data.


* PCA is a technique for emphasizing the variation in a dataset. The first principal component, or PCA1, represents the greatest amount of variance in the data.

* we could plot the normalized counts of every gene for one sample in the x-axis and the other sample on the y-axis. 
PC2, the dataset's second most variation, must be perpendicular to PC1 in order to best describe the variance in the dataset not included in PC1. PC2 has a much smaller spread.

* The number of principal components in the dataset is equal to the number of samples, n. PC1 means plotting a line through n-dimensional space to find the greatest amount of variation. 

* The principal component with the most variant genes has the greatest influence on the direction of that principal component.
Genes are given quantitative scores based on how much they influence the various PCs.
The product of the influence and the normalized read counts for each gene is multiplied by all genes to get a 'per sample' PC value.We usually plot these per-sample PC values for PCA.


* The gene expression profiles of samples that cluster together are more similar than those of samples that cluster apart, especially for the most variant genes.
As we hope to see replicated clusters together and conditions to separate on PC1, this is a good method to explore the quality of the data.

* This method can also be used to identify sample outliers and major sources of variation.


```{r}
plotPCA(vsd, intgroup = 'con')
```
For the PCA plot generated in the previous part, the output regarding the quality of the samples shows that The biological replicates tend to cluster together. The samples separate by condition on PC1.

<br> <br>


#### DE analysis 
##### fitting the raw counts to the DESeq2 model. 
* Estimated size factors for each gene, as well as the variation in expression across replicates were evaluated. The data can then be fitted to the negative binomial model using these calculations.

* A DESeq2 object containing the raw data, metadata, and *design* formula has already been created. The design formula tells DESeq2 which known major source of variation to control, for, or regress out, as well as which condition of interest to use for differential expression testing.


```{r}
#run the analysis
dds <- DESeq(dds)
```

The final DESeq2 object will have all of the data required to perform differential expression analysis between different sample groups.

```{r}
resultsNames(dds)
```

???????????

### set factor level
```{r}
dds$con <- relevel(dds$con, ref = "normal")
```



```{r}
#run the analysis again
dds <- DESeq(dds)
```

```{r}
resultsNames(dds)
```

???????????



#### Exploring the results
* The raw counts are modeled using the dispersion estimates; we should investigate how the raw data will match the model.

* The purpose of DE analysis is to see if the mean expression of a gene differs between sample groups when there is variation within groups.
This is accomplished by determining whether the difference in log2 fold changes between groups is significantly different from zero.

* The log2 fold changes are calculated by dividing the mean of one sample group by the mean of the other sample group. As a result, information regarding the mean and variation in the data is required to model the counts.

<br><br>

#### Plotting Dispersion

* We'll look at the variance in gene expression relative to the mean to see how our data varies.

* Variance is the square of the standard deviation, representing how far away the expression of the individual samples, are from the means. 

*The variance is expected to increase with the gene's mean expression in RNA-Seq data. To observe this relationship, we can use the apply() function to calculate the means and variances for each gene in the normal samples.
Then, using ggplot2, we create a data frame for plotting.
The log10 scales can be used to plot the mean and variance for each gene.

* we can plot the mean and variance for each gene using the log10 scales.


* In the DESeq2 model, a metric called dispersion describes a measure of variance for a given mean to assess the variability in expression.

* Dispersion formula: Var = μ + α*μ^2 . 

```{r}
plotDispEsts(dds)
```
* Each black dot represents a gene, with associated mean and dispersion values. With RNA seq data, the variance of gene expression increases as the mean decreases, which is to be expected. Also, notice how the variance range for lower mean counts is wider than for higher mean counts.

* As the mean increases, the dispersion values decrease. The increase in variance, on the other hand, increases dispersion.

* Because gene-wise estimates of dispersion are often misleading in RNA-seq experiments with only a few replicates, DESeq2 uses information from all genes to determine the most likely estimates of dispersion for a given mean expression value, as shown by the red line in the figure.


* Genes with inaccurately tiny estimates of variation could return many false postives, or genes classified as DE, when they are really not. Therefore, the original gene-wise dispersion estimates, shown as the black dots in the figure, are shrunken towards the curve to yield more accurate estimates of dispersion. 

* For identifying the differentially expressed genes, the more accurate, shrunken dispersion estimates are applied to model the counts.

*Extremely high dispersion values, encircled by blue circles, aren't shrunk since there's a chance the gene has more variability than others for biological or technical reasons, and lowering the variability could lead to false positives. 

* The strength of the shrinkage is determined by the sample size and distance from the curve. A large number of replicates allows for more accurate estimation of the mean and variation, resulting in less shrinkage.
In the figure, The dispersions decrease with increasing mean and cluster around the maximum likelihood (ML) line, indicating a satisfactory match.

<br><br>

#### First comparison Normal Vs radial growth phase "RGP"
we will try to find the significant genes during the development of cancer cells from melanocytes to the Radial growth phase. 

#### Extraction of results (normal vs RGP)
* Now that we have explored the fit of our data to the model, we can extract the DE testing model.

* We can also get more precise foldchange estimates, which measure the expression of one sample group compared to another. The Wald test is used by DESeq2 to compare two sample groups for the condition of interest, in this case "normal and radial growth Phase."


```{r}
#sadad
dds1_res <- results(dds, contrast = c("con", "radial", "normal"))
dds1_res
```



log2 fold change : condition radial vs normal, stating that normal is the base level of comparison. Which means that all log2fold changes represent the radial group relative to the normal group. 


#### The MA plot 
For all genes analyzed, the MA plot depicts the mean of normalized counts vs log2foldchanges.

```{r}
plotMA(dds, ylim=c(-20,20))
```

* The genes with a significant DE are colored in blue. 
* The large log2 foldchanges, especially for genes with lower mean count values, should be noted. Genes with less information, such as those with a low number of counts or high dispersion values, are unlikely to be as accurate as genes with more information.
* We can use log2foldchange shrinkage to improve the estimated fold changes.


#### LFC shrinkage 
for genes with low amount of data available, shrinkage utilizes information from all genes to provide more likely, lower, log2 fold change estimates, similar to what we performed with dispersion.


```{r}
dds1_LFC <- lfcShrink(dds,
                      coef = "con_radial_vs_normal" ,
                      res = dds1_res)
```




```{r}
plotMA(dds1_LFC, ylim=c(-20,20))

```
* The log2fold change values are now more restricted, especially for lowly expressed genes. 

* The shrunken log2foldchanges should be more precise; however, shrinking the log2 foldchanges will not affect the number of differentially expressed genes returned, only the log2 fold change values.
Now we can retrieve the significant DE genes and perform further visualization of results. 

<br><br>

### DESeq2 results table
#### getting the description for the columns in the results table
```{r}
mcols(dds1_LFC)
```

<br><br>

#### determine siginificant DE gene
* We will use the p-value adjusted for the multiple test correction. Because, every gene tested with alpha equals to 0.05, meanst that there is a 5% chance that the gene is called as a DE when it is not, returning false positives. 
For this reason, multiple test correction is performed by DESeq2 using Benjamin-Hochberg, or BH-method, to adjust the p-values for multiple testing and control the proportion of false postives relative to true.

* If we had 1000 genes categorized as DE and used the BH-method with an alpha value of 0.05, we would expect 5% of the DE genes to be false positives or 50 genes.

* Prior to testing, DESeq2 automatically filters out genes that are unlikely to be actually differentially expressed, such as genes with zero counts across all samples, genes with low mean values across all samples, and genes with dramatic count outlines, to limit the number of genes tested.

```{r}
head(dds1_LFC,n =10 )
```
The filtered genes are denoted by a NA in the p-adjusted column in the results table. 

<br><br>

#### Siginificant DE genes summary 
```{r}
summary(dds1_LFC)
```
Over 1000 genes are DE which is the sum of log fold changes less than zero and greater than zero. 

```{r}
summary(dds1_LFC, alpha = 0.05)
```

<br><br>

##### Testing for siginifcant genes using both an alpha value threshold and a log2fold change threshold different from 0

A log2fold change threshold could be used to return genes that are most likely to be biologically meaningful. A log2fold change threshold isn't always the best choice. It can, however, be useful when dealing with huge numbers of DE genes.

```{r}
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale 
#dds1_res <- results(dds, 
 #                  contrast = c("con", "radial", "normal"),
  #                  alpha = 0.05,
#                    lfcThreshold = 0.32) 
#head(dds1_res)
```
While using any log2fold change cut-off raises the risk of losing biologically relevant genes, by using a very small log2 fold change threshold, we are aiming to reduce the risk that the more biologically relevant. 

```{r}
resultsNames(dds)
```


```{r}
#Reshrink the foldchanges 
dds1_LFC <- lfcShrink(dds,
                      coef = "con_radial_vs_normal",
                      res = dds1_res)
```


```{r}
# Turn results table into a data frame 
dds1_res_df <- data.frame(dds1_LFC)
head (dds1_res_df)
```



#### Ordering the genes by padjusted values for GSEA
```{r}
dds1_GSEA <- dds1_res_df %>% arrange(padj)
```

#### save the dds1_res object in a table to use it for GSEA
```{r}
#write.table(dds1_res, file = "DE_VGP_MET.txt", sep = " ", col.names = NA)
```

```{r}
#grcm38
```

```{r}

dds1_res_anno <- rownames_to_column(dds1_res_df, var = "ensgene") 
head(dds1_res_anno)
```

```{r}
hm_annotables <- grch38[, c( "ensgene", "symbol", "description")]

dds_test <- merge(dds1_res_anno, hm_annotables, by.x = "ensgene", by.y = "ensgene", all.x = T )
```

```{r}
#dds_test <- dds_test%>% dplyr::select(-ends_with(".y"))
```


```{r}
head(dds_test) 

```


```{r}
head(grch38)
```



```{r}
dds1_res_anno <- left_join(x = dds1_res_anno,
                 y = grch38[ , c("ensgene", "symbol", "description")],
                  by = c("ensgene"))
```

#### Remove Duplicates
```{r}
#non_duplicates <- which(duplicated(ids$symbol) == FALSE)

```



```{r}
head(dds1_res_anno)
```

```{r}
head(dds1_LFC)
```


```{r}
head(grch38)
```

<br><br>

#### Extracting the siginficant DE genes
```{r}
dds1_sig <- subset(dds1_res_anno, padj < 0.05)
```

<br><br>

#### Ordering the genes by padjusted values 
```{r}
dds1_sig <- dds1_sig %>% arrange(padj)
```
<br><br> 

#### Exploring the final table 
```{r}
View(dds1_sig)
```

```{r}
dds1_first20 <- dds1_sig[1:20,]
View(dds1_first20)
write.csv(dds1_first20, "dds1_first20.csv", row.names = F)
```


<br><br>

### Visualization of the results

#### Expression heatmap

```{r}
# Subset normalized counts to significant genes 
sig1_norm_counts <- norm_counts[dds1_sig$ensgene,]

# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()

```

```{r}
# Run pheatmap
pheatmap(sig1_norm_counts[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")
```


<br><br>
????????????????????????????????????????
#### Rlog Transformation for clustering and heatmaps 

```{r}
rld<- rlogTransformation(dds)
head(assay(rld))
```

```{r}
hist(assay(rld))

```
```{r}
sampleDists <- as.matrix(dist(t(assay(rld))))

```


```{r}
plotPCA(rld, intgroup="con")
```


```{r}
plotPCA(vsd, intgroup = 'con')

```

#### Heatmap rld
we (select) the genes for the heatmap in (order) and this order is selected by 
(rowMeans) and select the (count) of my deseq data set (normalized) to library size and we want (decreasing) to have info about all upregulated
and down regulated genes and set the number to present or to visualize [1:50] then we have to transform my normalized count and we select
log2.norm.counts which i will use in the heatmap and then tell the heatmap that the annotation_col is the df (annotation_col=df)
```{r}
#rowmeans is wrong ????
select1 <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE) [1:50]
nt <- normTransform(dds) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[select1,]
df <- as.data.frame(colData(dds)[,c("group","con")])
pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=FALSE, annotation_col=df, fontsize_row = 5)

```
From the heatmap we can Know the expression level of genes:
Dark blue indicates very low expressed genes light blue indicates low expressed genes Red indicates very highly expressed genes



###############################

#### Heatmap of regularized log transformed dds counts
```{r}
df <- as.data.frame(colData(rld)[,c("con","group")])
pheatmap(assay(rld)[select1,], cluster_rows=T, show_rownames=TRUE,
cluster_cols=T, annotation_col=df, fontsize_row =8)
```

###################


```{r}
top1_20 <- data.frame(sig1_norm_counts)[1:20,]  %>% rownames_to_column(var ="ensgene")
#key column will be a seperate column
top1_20 <- gather(top1_20, key ="samplename", value ="normalized_counts",2:4)
```
  
```{r}
#head(sig1_norm_counts)
```


```{r}

top1_20 <- inner_join(top1_20, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```


```{r}
ggplot(top1_20)+ 
  geom_point(aes(x = ensgene, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

```



#####################################################
# Test
#good idea is to make symbol as rownames
```{r}
#top1_20_symbol <- data.frame(sig1_norm_counts)[1:20,]  %>% rownames_to_column(var ="symbol")
#top1_20_symbol <- gather(top1_20, key ="samplename", value ="normalized_counts",2:4)
```


```{r}
#top1_20_symbol <- data.frame(sig1_norm_symbol)[1:60,]  %>% rownames_to_column(var ="symbol")
#top1_20_symbol <- gather(top1_20_symbol, key ="samplename", value ="normalized_counts",2:4)
```


```{r}
#top1_20_symbol <- inner_join(top1_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```

```{r}
#head(top1_20_smybol)
```

#############################

```{r}
# adding symbol to norm counts
#sig1_norm_symbol <- rownames_to_column(dds1_res_df, var = "ensgene")

#pick symbol from gr38
#sig1_norm_symbol <- left_join(x = sig1_norm_symbol,
        #         y = grch38[ , c("ensgene", "symbol")],
         #         by = c("ensgene"))
#
#remove duplicates
#sig1_norm_symbol <- unique(sig1_norm_symbol)

#Order by p-adjusted Value
#sig1_norm_symbol <- sig1_norm_symbol %>% arrange(padj)

#get 1st 50
#sig1_norm_symbol <- sig1_norm_symbol[1:70,]

#change symbol to be rownames insted of ensembl ids
#row.names(sig1_norm_symbol) <- sig1_norm_symbol$symbol

#remove last column 
#sig1_norm_symbol <- sig1_norm_symbol[,-7]


```

#another try

```{r}

## Subset normalized counts to significant genes 
sig1_norm_symbol <- norm_counts[dds1_sig$ensgene,]

#apply this to norm counts
sig1_norm_symbol <- rownames_to_column(as.data.frame(sig1_norm_symbol), var = "ensgene")

#pick symbol from gr38
sig1_norm_symbol <- left_join(x =  sig1_norm_symbol,
                 y = grch38[ , c("ensgene", "symbol")],
                  by = c("ensgene"))

#remove duplicates
sig1_norm_symbol <- unique(sig1_norm_symbol)
```

```{r}
top1_20_symbol <- data.frame(sig1_norm_symbol)[1:20,]  %>% rownames_to_column(var ="ids")

#remove ids column
top1_20_symbol <- top1_20_symbol[,-1]

#key column will be a seperate column
top1_20_symbol <- gather(top1_20_symbol, key ="samplename", value ="normalized_counts",2:4)
```
  

```{r}

top1_20_symbol <- inner_join(top1_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```


```{r}
ggplot(top1_20_symbol)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

```

#### Heatmap
```{r}
# Run pheatmap
pheatmap(sig1_norm_counts[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")
```



```{r}
head(sig1_norm_counts)
```


```{r}
head(sig1_norm_symbol)
```

```{r}
#remove duplicates
sig1_norm_symbol <- unique(sig1_norm_symbol)

#get 1st 50
sig1_norm_symbol <- sig1_norm_symbol[1:70,]

#change symbol to be rownames instead of ensembl ids
row.names(sig1_norm_symbol) <- sig1_norm_symbol$symbol

#remove first and last column 
sig1_norm_symbol <- sig1_norm_symbol[,c(-1,-16)]
```

```{r}
# Run pheatmap
pheatmap(sig1_norm_symbol[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")
```

```{r}
pheatmap(sig1_norm_symbol[1:30,],
         #color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),cluster_cols=T,               fontsize_row = 7,
         scale ="row")

```


```{r}
# Run pheatmap
pheatmap(sig1_norm_symbol[1:30,1:4],
         #color = heat_colors, 
         cluster_rows =F,
         cluster_cols = F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         fontsize_row = 7,
         scale ="row")
```


#############################################################################

<br><br>

### Releveling 



Instead of using the revel() function to make multiple comparisons, we will subset the data to make the desired comparison clear. 
<br><br>

#### Secound comparison RGP Vs vertical growth phase "VGP"
we will try to find the significant genes during the development of cancer cells from the Radial growth phase to vertical growth phase. 

??????????????????????????????

```{r}
resultsNames(dds)
```

### set factor level
```{r}
dds$con <- relevel(dds$con, ref = "radial")
```

#### DE analysis 
##### Fitting the raw counts to the DESeq2 model.
```{r}
#run the analysis again
dds2 <- DESeq(dds)
```

```{r}
resultsNames(dds2)
```




<br> <br>

#### Colors for plots

```{r}
#mycols <- brewer.pal(11, "Set3")[1:length(unique(group1))]
```


<br><br>

### Checking the design of your DESEeqDataSets

```{r}
design(dds2)
```

<br><br>


```{r}
head(counts(dds2)) #un-normalized
```




<br><br>

#### Exploring the results
 
```{r}
plotDispEsts(dds2)
```

<br><br>

#### Extract the DE testing model.


# Extraction of results 
```{r}

dds2_res <- results(dds2, contrast = c("con", "vertical", "radial"))
dds2_res
```


<br><br>

#### The MA plot 
 
```{r}
plotMA(dds2, ylim=c(-20,20))
```
The genes that are significantly DE are colored blue. 

<br><br>

##### LFC shrinkage 


```{r}
dds2_LFC <- lfcShrink(dds2,
                      coef = "con_vertical_vs_radial" ,
                      res = dds2_res)
```

```{r}
plotMA(dds2_LFC, ylim=c(-20,20))

```

<br><br>

### DESeq2 results table
#### Getting the description for the columns in the results table
```{r}
mcols(dds2_LFC)
```

<br><br>

#### Determining the siginificant DE genes, 
* We will use the adjusted p-value for the multiple test correction.

* The reason for this is that for every gene tested with alpha 0.05, there is a 5% chance that the gene is called as a DE when it is not, yielding false positives.

* Therefore, multiple test correction is performed by DESeq2 using Benjamin-Hochberg, or BH-method, to adjust the p-values for multiple testing and control the proportion of false postives relative to true.

* Using BH-method and alpha value of 0.05, if we had 1000 genes identified as DE, we would expect 5% of the DE genes to be false positives, or 50 genes.
to reduce the number of genes tested, DESeq2 automatically filters out genes unlikely to be truly differential expressed prior to testing, such as genes with zero counts across all samples, genes with low mean values across all samples, and genes with extreme count outlines. 

```{r}
head(dds2_LFC,n =10 )
```
We can see the filtered genes in the results table represented by an NA in the p-adjusted column. 

<br><br>

#### Siginificant DE genes summary 
```{r}
summary(dds2_LFC)
```
Over 4000 genes are DE which is the sum of log fold changes less than zero and greater than zero. 

<br> <br>
```{r}
summary(dds2_LFC, alpha = 0.05)
```

<br><br>

##### Testing for siginifcant genes using both an alpha value threshold and a log2fold change threshold different from 0
```{r}
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale 
#remove lfcThreshold use lfc = 1??????
dds2_res <- results(dds2, 
                    contrast = c("con", "vertical", "radial"),
                    alpha = 0.05,
                    #lfcThreshold = 0.32
                    ) 
```

 

```{r}
#Reshrink the foldchanges 
dds2_LFC <- lfcShrink(dds2,
                      coef = "con_vertical_vs_radial" ,
                      res = dds2_res)
```

<br><br>

#### Annotaion of the genes
We will use the "annotables" library to annotate the Ensembl genes.
```{r}
# Turn results table into a data frame 
dds2_res_df <- data.frame(dds2_LFC)
head (dds2_res_df)
```

```{r}
head(grcm38)
```

```{r}

dds2_res_anno <- rownames_to_column(dds2_res_df, var = "ensgene") 

```

```{r}
dds2_res_anno <- left_join(x = dds2_res_anno,
                 y = grch38[, c("ensgene", "symbol", "description")],
                  by = c("ensgene"))
```

```{r}
#View(dds2_res_anno)
```

<br><br>

#### Extraction of the siginficant DE genes
```{r}
dds2_sig <- subset(dds2_res_anno, padj < 0.05)
```
<br><br>
#### ordering the genes by p-adjusted values 
```{r}
dds2_sig <- dds2_sig %>% arrange(padj)
head(dds2_sig)
```

```{r}
head(dds2_sig)
```
```{r}
dds2_first20 <- dds2_sig[1:20,]
#View(dds2_first20)
write.csv(dds2_first20, "dds2_first20.csv", row.names = F)
```

<br><br>

#### Exploring the final table 
```{r}
#View(dds2_sig)
```

<br><br>

### Visualization of the results

#### Expression heatmap

```{r}
# Subset normalized counts to significant genes 
sig_norm_counts2 <- norm_counts[dds2_sig$ensgene,]

# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()

```


```{r}
# Run pheatmap
pheatmap(sig_norm_counts2,
         color = heat_colors, 
         cluster_rows =T,
         show_rownames =F,
         annotation = dplyr::select(colData, con),
         scale ="row")

```

<br><br>

#### Visualizing results-Expression plot

```{r}
top2_20 <- data.frame(sig_norm_counts2)[1:20,]  %>% rownames_to_column(var ="symbol")
top2_20 <- gather(top2_20, key ="samplename", value ="normalized_counts",4:9)
```


```{r}
top2_20 <- inner_join(top2_20, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```


```{r}
ggplot(top2_20)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

```


##########################


```{r}

## Subset normalized counts to significant genes 
sig2_norm_symbol <- norm_counts[dds2_sig$ensgene,]

#apply this to norm counts
sig2_norm_symbol <- rownames_to_column(as.data.frame(sig2_norm_symbol), var = "ensgene")

#pick symbol from gr38
sig2_norm_symbol <- left_join(x =  sig2_norm_symbol,
                 y = grch38[ , c("ensgene", "symbol")],
                  by = c("ensgene"))

#remove duplicates
sig2_norm_symbol <- unique(sig2_norm_symbol)
```

```{r}
head(sig2_norm_symbol)
head(top2_20_symbol)
```


```{r}
top2_20_symbol <- data.frame(sig2_norm_symbol)[1:20,]  %>% rownames_to_column(var ="ids")

#remove ids column
top2_20_symbol <- top2_20_symbol[,-1]

#key column will be a seperate column
top2_20_symbol <- gather(top2_20_symbol, key ="samplename", value ="normalized_counts",4:9)
```
  

```{r}

top2_20_symbol <- inner_join(top2_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```


```{r}
ggplot(top2_20_symbol)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

```

#### Heatmap



```{r}
#remove duplicates
sig2_norm_symbol <- unique(sig2_norm_symbol)

#get 1st 50
sig2_norm_symbol <- sig2_norm_symbol[1:70,]

#change symbol to be rownames instead of ensembl ids
row.names(sig2_norm_symbol) <- sig2_norm_symbol$symbol

#remove first and last column 
sig2_norm_symbol <- sig2_norm_symbol[,c(-1,-16)]
```

```{r}
# Run pheatmap
pheatmap(sig2_norm_symbol[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")
```

```{r}
pheatmap(sig2_norm_symbol[1:30,],
         #color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),cluster_cols=T,               fontsize_row = 7,
         scale ="row")

```


```{r}
# Run pheatmap
pheatmap(sig2_norm_symbol[1:30,3:10],
         #color = heat_colors, 
         cluster_rows =F,
         cluster_cols = F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         fontsize_row = 7,
         scale ="row")
```



####################


---

<br><br>

#### Vertical growth phase (VGP) Vs Metastasis "MET"

Now, we will move to the 3nd comparison and perform the analysis again. 
we will try to find the significant genes during the development from the Vertical growth phase to the Metastasis.

```{r}
head(counts_data)
```

<br><br>


```{r}
resultsNames(dds)
```

### set factor level
```{r}
dds$con <- relevel(dds$con, ref = "vertical")
```

#### DE analysis 
##### Fitting the raw counts to the DESeq2 model.
```{r}
#run the analysis again
dds3 <- DESeq(dds)
```

```{r}
resultsNames(dds3)
```




<br><br>


#### Exploring the results
 
```{r}
plotDispEsts(dds3)
```


<br><br>

#### Extraction of results 
```{r}
dds3_res <- results(dds3, contrast = c("con", "metastasis", "vertical"))
dds3_res
```

log2 fold change : condition MET vs VGP, indicating that VGP is the base level of comparison.

<br><br>

#### The MA plot 

```{r}
plotMA(dds3, ylim=c(-20,20))
```
The genes that are significantly DE are colored blue. 

<br><br>

#### LFC shrinkage 


```{r}
dds3_LFC <- lfcShrink(dds3,
                      coef = "con_metastasis_vs_vertical" ,
                      res = dds3_res)
```

```{r}
plotMA(dds3_LFC, ylim=c(-20,20))
```
Now we see more restricted log2fold change values, especially for lowly expressed genes.

<br><br>

### DESeq2 results table
#### Getting description for the coloumns in the results table
```{r}
mcols(dds3_LFC)
```

<br><br>

#### Determine siginificant DE genes, 

```{r}
head(dds3_LFC,n =10 )
```

<br><br>

#### Siginificant DE genes summary 
```{r}
summary(dds3_LFC)
```
Over 5000 genes are DE. 
<br> <br

```{r}
summary(dds3_LFC, alpha = 0.05)
```

<br><br>

##### Testing for siginifcant genes using both an alpha value threshold and a log2fold change threshold different from 0
```{r}
#Rerun results function, Using 1.25 foldchange threshold which is 0.32 in the log2 scale 
dds3_res <- results(dds3, 
                    contrast = c("con", "metastasis", "vertical"),
                    alpha = 0.05,
                 #   lfcThreshold = 0.32
                 ) 
```



```{r}
#Reshrink the foldchanges 
dds3_LFC <- lfcShrink(dds3,
                      coef = "con_metastasis_vs_vertical" ,
                      res = dds3_res)
```

<br><br>

#### Annotaion of the genes

```{r}
# Turn results table into a data frame 
dds3_res_df <- data.frame(dds3_LFC)
head (dds3_res_df)
```

```{r}

dds3_res_anno <- rownames_to_column(dds3_res_df, var = "ensgene") 

```

```{r}
dds3_res_anno <- left_join(x = dds3_res_anno,
                 y = grch38[, c("ensgene", "symbol", "description")],
                  by = c("ensgene"))
```

```{r}
View(dds3_res_anno)
```

<br><br>

#### Extraction of the siginficant DE genes
```{r}
dds3_sig <- subset(dds3_res_anno, padj < 0.05)
```

<br><br>

#### Ordering the genes by padjusted values 
```{r}
dds3_sig <- dds3_sig %>% arrange(padj)
```

<br><br>

#### Exploring the final table 
```{r}
#View(dds3_sig)
```

```{r}
dds3_first20 <- dds3_sig[1:20,]

write.csv(dds3_first20, "dds3_first20.csv", row.names = F)
```

<br><br>

### Visualization of the results

#### Expression heatmap

```{r}
# Subset normalized counts to significant genes 
sig_norm_counts3 <- norm_counts[dds3_sig$ensgene,]

# Choose a color palette from RColorBrewerlibrary(RColorBrewer)
heat_colors <- brewer.pal(6,"YlOrRd")
display.brewer.all()

```


```{r}
# Run pheatmap
pheatmap(sig_norm_counts3,
         color = heat_colors, 
         cluster_rows =T,
         show_rownames =F,
         annotation = dplyr::select(colData, con),
         scale ="row")
##error NA/NAN vlaues
```
  

<br><br>

#### Removing Nas and runnig pheatmap again
```{r}
sig_norm_counts2[sig_norm_counts2==0] <- NA
 
# Delete the rows associated with NA.

sig_norm_counts2<-sig_norm_counts2[complete.cases(sig_norm_counts2),]
```

```{r}
# Run pheatmap
#pheatmap(sig_norm_counts2,
      #   color = heat_colors, 
       #  cluster_rows =T,
        # show_rownames =F,
         #annotation = select(colData2, con2),
         #scale ="row")
```

<br><br>

#### Visualizing results-Expression plot

```{r}
top3_20 <- data.frame(sig_norm_counts3)[1:20,]  %>% rownames_to_column(var ="ensgene")
top3_20 <- gather(top3_20, key ="samplename", value ="normalized_counts",9:14)
```


```{r}
top3_20 <- inner_join(top3_20, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```


```{r}
ggplot(top3_20)+ 
  geom_point(aes(x = ensgene, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

```


##########################


```{r}

## Subset normalized counts to significant genes 
sig3_norm_symbol <- norm_counts[dds3_sig$ensgene,]

#apply this to norm counts
sig3_norm_symbol <- rownames_to_column(as.data.frame(sig3_norm_symbol), var = "ensgene")

#pick symbol from gr38
sig3_norm_symbol <- left_join(x =  sig3_norm_symbol,
                 y = grch38[ , c("ensgene", "symbol")],
                  by = c("ensgene"))

#remove duplicates
sig3_norm_symbol <- unique(sig3_norm_symbol)
```

```{r}
head(sig3_norm_symbol)

```


```{r}
top3_20_symbol <- data.frame(sig3_norm_symbol)[1:20,]  %>% rownames_to_column(var ="ids")

#remove ids column
top3_20_symbol <- top3_20_symbol[,-1]

#key column will be a seperate column
top3_20_symbol <- gather(top3_20_symbol, key ="samplename", value ="normalized_counts",10:14)
```
  

```{r}

top3_20_symbol <- inner_join(top3_20_symbol, rownames_to_column(colData, var ="samplename"),                     by ="samplename")
```


```{r}
ggplot(top3_20_symbol)+ 
  geom_point(aes(x = symbol, y = normalized_counts, color = con))+        scale_y_log10()+    
  xlab("Genes")+      
  ylab("Normalized Counts")+       
  ggtitle("Top 20 Significant DE Genes")+  
  theme_bw()+    
  theme(axis.text.x = element_text(angle =45, hjust =1))+  
  theme(plot.title = element_text(hjust =0.5))

```

#### Heatmap



```{r}
#remove duplicates
sig3_norm_symbol <- unique(sig3_norm_symbol)

#get 1st 50
sig3_norm_symbol <- sig3_norm_symbol[1:70,]

#change symbol to be rownames instead of ensembl ids
row.names(sig3_norm_symbol) <- sig3_norm_symbol$symbol

#remove first and last column 
sig3_norm_symbol <- sig3_norm_symbol[,c(-1,-16)]
```

```{r}
# Run pheatmap
pheatmap(sig3_norm_symbol[1:30,],
         color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         scale ="row")
```

```{r}
pheatmap(sig3_norm_symbol[1:30,],
         #color = heat_colors, 
         cluster_rows =F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),cluster_cols=T,               fontsize_row = 7,
         scale ="row")

```


```{r}
# Run pheatmap
pheatmap(sig3_norm_symbol[1:30,5:14],
         #color = heat_colors, 
         cluster_rows =F,
         cluster_cols = F,
         show_rownames =T,
         annotation = dplyr::select(colData, con),
         fontsize_row = 7,
         scale ="row")
```



####################



<br><br>

### Find Specific genes in every phase-comparison
#### Subset unmatched rows from data frame  
Now, we have three tables of DE genes. 
* The first one represents the DE genes from normal to RGP.
* The second one represents the DE genes from RGP to VGP.
* The third one represents the DE genes from VGP to MET.

```{r}
#normal vs RGP
head(dds1_sig) 
```


```{r}
#RGP vs VGP
head(dds2_sig) 
```


```{r}
#VGP vs MET
head(dds3_sig) 
```

<br><br><br><br>

#### Normal vs RGP

We will use the anti_join() from dplyr package to filter out the specific genes which is related only to the development of cells from normal to RGP.
The idea is to subtract from both phases ("RGP vs VGP" table and "VGP vs MET")

```{r}
#subtract from "RGP vs VGP" table (dds2_sig) 
dds1_1 <-anti_join(dds1_sig, dds2_sig, by= c("ensgene", "symbol"))
head(dds1_1)
```

<br><br>

#### now we will subtract the new dataframe from the last phase only

```{r}
#subtract from "VGP vs MET" table (dds3_sig) 
dds1_1 <-anti_join(dds1_1, dds3_sig, by= c("ensgene", "symbol"))
head(dds1_1)
```


So now we only have genes that are specific to Normal vs RGP and are not shared with other developmental  phases.
```{r}
#save the DE genes in a table

write.table(dds1_1,file = "DE_norm_RGP.txt", sep = "\t", row.names = FALSE)

```
<br><br>

### for DAVID analysis
```{r}
genes1_1 <- as.factor(dds1_1$symbol)
write.table(genes1_1, "genes1_1.txt")
```

```{r}
export(genes1_1, "genes1_1.xlsx")
```



#### RGP vs VGP
The idea is to subtract from both phases ("normal vs RGP" and "VGP vs MET")

```{r}
#subtract from "normal vs RGP"
dds2_2 <-anti_join(dds2_sig, dds1_sig, by= c("ensgene", "symbol"))
View(dds2_2)
```



```{r}
#subtract from "VGP vs MET"
dds2_2 <-anti_join(dds2_2, dds3_sig, by= c("ensgene", "symbol"))
View(dds2_2)
```

reduced from 897 Genes to 602.

```{r}
#save the DE genes in a table

write.csv(dds2_2,file = "DE_RGP_VGP.csv", row.names = FALSE)

```
<br><br>

#### VGP vs MET
We will use the same approach 
```{r}
#subtract from "normal vs RGP(dds1_sig)"
dds3_3 <-anti_join(dds3_sig, dds1_sig, by= c("ensgene", "symbol"))
View(dds3_3)
```


```{r}
#subtract from "RGP vs VGP"
dds3_3 <-anti_join(dds3_3, dds2_sig, by= c("ensgene", "symbol"))
View(dds3_3)
```

Genes are reduced from 1208 to 913 genes, which are specific only for this phase from vertical growth phase to Metastasis.

```{r}
#save the DE genes in a table
write.table(dds3_3, file = "DE_VGP_MET.txt", sep = " ", col.names = NA)

```

GO? 
GSEQ?


#### Drawing a venndiagram
 				venn.diagram(list("list C"=C, "list D"=D), fill = c("yellow","cyan"), cex = 1.5, filename="Lesson-06/Venn_diagram_genes_2.png")

				venn.diagram(list(A = A, C = C, D = D), fill = c("yellow","red","cyan"), cex = 1.5,filename="Lesson-06/Venn_diagram_genes_3.png")

				venn.diagram(list(A = A, B = B, C = C, D = D), fill = c("yellow","red","cyan","forestgreen"), cex = 1.5,filename="Lesson-06/Venn_diagram_genes_4.png")

				
```{r}
#install.packages("VennDiagram")
library(VennDiagram)
```


```{r}
#venn.diagram(list(A = dds1_omit, C = dds2_omit, D = dds3_omit), fill = c("yellow","red","cyan"), cex = 1.5,filename="Venn_diagram_genes_3.png")

```


```{r}
which(is.na(dds1_sig))
```

```{r}
sum(is.na(dds1_sig))
```
delete na values 
```{r}
dds1_omit <- na.omit(dds1_sig)
```


```{r}
sum(is.na(dds1_omit))
```


```{r}
dds2_omit <- na.omit(dds2_sig)
dds3_omit <- na.omit(dds3_sig)

```


```{r}
grid.newpage()                                       
draw.triple.venn(area1 = 10,                         
                 area2 = 10,
                 area3 = 10,
                 n12 = 2,
                 n23 = 2,
                 n13 = 2,
                 n123 = 1,
                 fill = c("pink", "green", "orange"),
                 lty = "blank",
              category = c("normal_Vs_RGP", "RGP_Vs_VGP", "VGP_Vs_MET"))
```



```{r}
head(dds1_omit)
```

```{r}
dds1_venn <- dds1_omit$ensgene
class(dds1_venn)
```

```{r}
dds2_venn <- dds2_omit$ensgene
class(dds2_venn)
```

```{r}
dds3_venn <- dds3_omit$ensgene
 
```



```{r}
venn.diagram(list("normal vs RGP" = dds1_venn, "RGP vs VGP" = dds2_venn, "VGP vs MET" = dds3_venn), fill = c("yellow","red","cyan"), cex = 1.5,filename="Venn_diagram_1.png")

```

 

# Actually plot the plot
grid.draw(venn.plot)

```{r}
venn.plot <- draw.pairwise.venn(length(dds1_venn),
                                length(dds2_venn),
                                # Calculate the intersection of the two sets
                                length( intersect(dds1_venn, dds2_venn) ),
                                category = c("Normal vs RGP", "RGP vs VGP"), scaled = F,
                                fill = c("light blue", "pink"), alpha = rep(0.5, 2),
                                cat.pos = c(0, 0)) ;
grid.draw(venn.plot);
grid.newpage();
```

```{r}
```


```{r}
library(gplots)
# Create a Venn-diagram given just the list of gene-names for both sets
#venn(list("IPSC Trisomic vs Disomic" = ipsc.degs,
#          "NEURON Trisomic vs Disomic" = neur.degs), )
```


```{r}
library(gplots)
venn(list("noramal vs RGP" = dds1_venn,
          "RGP vs VGP" = dds2_venn, 
     "VGP vs MET"= dds3_venn))
```


```{r}
venn(list("RGP vs VGP" = dds2_venn, 
     "VGP vs MET"= dds3_venn), )
```


#### Functional annotation 
Now, we will use clusterProfiler to perform:
* Over representation analysis
* Gene Set Enrichment Analysis GSEA

```{r}
library(org.Hs.eg.db)
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(enrichplot)
library(ggnewscale)
library(tidyverse)
```
To perform the over-representation analysis, we need a list of background genes and a list of significant genes. For our background dataset we will use all genes tested for differential expression (all genes in our results table). For our significant gene list we will use genes with p-adjusted values less than 0.05 (we could include a fold change threshold too if we have many DE genes).

```{r}
## Create background dataset for hypergeometric testing using all genes tested for significance in the results                 
allOE_genes <- as.character(dds1_res_anno$ensgene)

## Extract significant results
sigOE <- dplyr::filter(dds1_res_anno, padj < 0.05)

sigOE_genes <- as.character(sigOE$ensgene) #genes vector
```


Now we can perform the GO enrichment analysis and save the results:

```{r}
## Run GO enrichment analysis 
ego <- enrichGO(gene = sigOE_genes, 
                universe = allOE_genes,
                keyType = "ENSEMBL",
                OrgDb = org.Hs.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                qvalueCutoff = 0.05, 
                readable = TRUE)
                
## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)

write.csv(cluster_summary, "GO1_NorRad.csv")
```


##### Visualizing clusterProfiler results

clusterProfiler has a variety of options for viewing the over-represented GO terms. We will explore the dotplot, enrichment plot, and the category netplot.

The dotplot shows the number of genes associated with the first 50 terms (size) and the p-adjusted values for these terms (color). This plot displays the top 50 genes by gene ratio (# genes related to GO term / total number of sig genes), not p-adjusted value.

#### Dotplot 
```{r}
dotplot(ego, showCategory=25, title = "GO Biological process enrichment",label_format = 55)

```

```{r}
barplot(ego, showCategory = 15)
```



```{r}
head(ego2)
```



The next plot is the enrichment GO plot, which shows the relationship between the top 50 most significantly enriched GO terms (padj.), by grouping similar terms together. The color represents the p-values relative to the other displayed terms (brighter red is more significant) and the size of the terms represents the number of genes that are significant from our list.


a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional module.
```{r}
## Enrichmap clusters the 50 most significant (by padj) GO terms to visualize relationships between terms
#emapplot(ego, showCategory = 10)

##Error in has_pairsim(x)  Please use pairwise_termsim function to deal with the results of enrichment analysis.
```

```{r}
x2 <- pairwise_termsim(ego)
emapplot(x2, showCategory = 10, label_format= 40, cex_category=.8)
```


Finally, the category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). The size of the GO terms reflects the pvalues of the terms, with the more significant terms being larger. 
This plot is particularly useful for hypothesis generation in identifying genes that may be important to several of the most affected processes.

## To color genes by log2 fold changes, we need to extract the log2 fold changes from our results table creating a named vector
```{r}
OE_foldchanges <- sigOE$log2FoldChange

names(OE_foldchanges) <- sigOE$symbol
```


```{r}
## Cnetplot details the genes associated with one or more terms - by default gives the top 5 significant terms (by padj)
cnetplot(ego, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=3,
         ggrepel.max.overlaps = Inf,
          )
```


```{r}
ego2 <- data.frame(ego)
View(ego2)
```

If you are interested in significant processes that are not among the top five, you can subset your ego dataset to only display these processes:


```{r}
# find index to subset
which(ego2$ID == "GO:0042438") #melanin biosynthetic process
which(ego2$ID == "GO:0048021") #regulation of melanin biosynthetic 
which(ego2$ID == "GO:0010631") #epithelial cell migration
which(ego2$ID == "GO:0043473") #Pigmentation
which(ego2$ID == "GO:0050673") #epithelial cell proliferation
which(ego2$ID == "GO:0043588") #skin development
```



```{r}
## Subsetting the ego results without overwriting original `ego` variable
ego3 <- ego

ego3@result <- ego@result[c(140
,611,14,71,45),]

## Plotting terms of interest
cnetplot(ego3, 
         categorySize="pvalue", 
         foldChange=OE_foldchanges, 
         showCategory = 5, 
         vertex.label.font=6)

```

```{r}
head(ego3)
```

```{r}
## convert gene ID to Symbol
edox <- setReadable(ego, 'org.Hs.eg.db', 'ENTREZID')
p1 <- cnetplot(edox, foldChange=OE_foldchanges)

## categorySize can be scaled by 'pvalue' or 'geneNum'
p2 <- cnetplot(edox, categorySize="pvalue", foldChange=OE_foldchanges)
p3 <- cnetplot(edox,showCategory = 3, foldChange=OE_foldchanges, circular = TRUE, colorEdge = TRUE) 
cowplot::plot_grid(p1, p2, p3, ncol=3, labels=LETTERS[1:3], rel_widths=c(.8, .8, 1.2))
```


```{r}
cowplot::plot_grid( p3, rel_widths=c(.8, .8, 1.2))
```


### Gene set Enrichement analysis using GSEA Java-based

#### preparing normalized counts for GSEA 
```{r}
#convert norm_counts to df
norm_counts_df <- data.frame(norm_counts)
```



```{r}
norm_counts_GSEA <- norm_counts_df %>% mutate(desc= NA, .before="NHEM76")
```

```{r}
head(norm_counts_GSEA)
```

```{r}
# give the genes coloumn a proper name
norm_counts_GSEA <- cbind(ID  = rownames(norm_counts_GSEA), norm_counts_GSEA)
head(norm_counts_GSEA)
```


```{r}
# remove original rownames
#rownames(norm_counts_GSEA ) <- NULL
```
 
```{r}
head(norm_counts_GSEA)
```
 

```{r}
#save the normalized counts in a table ready for GSEA Java based program
write.table(norm_counts_GSEA, file = "GSEA/Normal2_GSEA.txt", sep = "\t", row.names = F)

```



```{r}
#write.table(norm_counts, file = "GSEA/norm_counts.tsv", sep= "\t", col.names = NA)

```


```{r}
export(norm_counts, rowNames = T, "GSEA/norm_counts.xlsx")
```



#### preparing normalized counts for GSEA 
```{r}
#convert norm_counts to df
norm_counts_df <- data.frame(norm_counts)
normal_Radial_GSEA <- norm_counts_df %>% dplyr::select(1:4)
head(normal_Radial_GSEA)
```

```{r}
normal_Radial_GSEA <- normal_Radial_GSEA %>% mutate(desc= NA, .before="NHEM76")
```

```{r}
head(normal_Radial_GSEA)
```

```{r}
# give the genes coloumn a proper name
normal_Radial_GSEA <- cbind(ID  = rownames(normal_Radial_GSEA), normal_Radial_GSEA)
head(normal_Radial_GSEA)
```



```{r}
# remove original rownames
rownames(normal_Radial_GSEA ) <- NULL
```
 
```{r}
head(normal_Radial_GSEA)
```
 

```{r}
#save the normalized NormalVSRadial counts in a table
write.table(normal_Radial_GSEA, file = "normal_Rad_GSEA.txt", sep = "\t", row.names = F)

```


```{r}
export(normal_Radial_GSEA, "normal_Rad_GSEA.xlsx")
```

```{r}
class(normal_Radial_GSEA)
```

```{r}
head(normal_Radial_GSEA)
```




```{r}
#DAVID1 <- read.table("DAVID/Norm_Rad/chart_AAD5D588820E1657803685183.tsv", sep ="\t", header = T)
```



```{r}
#head(DAVID1)
```

```{r}
#barplot
#ggplot(DAVID1[1:5], aes(x=Count, y = Term))
#+geom_bar()
```




### GSEA
Steps toward doing gene set enrichment analysis (GSEA):

 1- obtaining stats for ranking genes in your experiment,
 2- creating a named vector out of the DESeq2 result
 3- Obtaining a gene set from mysigbd
 4- doing analysis

already we performed DESeq2 analysis and have statistics for working on it

```{r}
#dds1_GSEA<- results(dds, 
                   contrast = c("con", "radial", "normal"),
                    alpha = 0.05)
```


```{r}
head(dds1_GSEA)
```

```{r}
head(dds1_res_anno)
```
```{r}
dds1_GSEA_df <- data.frame(dds1_GSEA)
```


```{r}

dds1_GSEA_anno <- rownames_to_column(dds1_GSEA_df, var = "ensgene") 
head(dds1_GSEA_anno)
```

```{r}
dds1_GSEA_anno <- left_join(x = dds1_GSEA_anno,
                 y = grch38[, c("ensgene", "symbol", "description")],
                  by = c("ensgene"))
head(dds1_GSEA_anno)
```

#### Remove Duplicates
```{r}
#non_duplicates <- which(duplicated(ids$symbol) == FALSE)

```




```{r}
# remove the NAs, averaging statitics for a multi-hit symbol
dds1_GSEA_final <- dds1_GSEA_anno %>% 
  dplyr::select(symbol, stat) %>% 
  distinct() %>% 
  group_by(symbol) %>% 
  summarize(stat=mean(stat))
```

```{r}
head(dds1_GSEA_final)
```

# creating  a named vector [ranked genes]

```{r}
ranks <- dds1_GSEA_final$stat
names(ranks) <- dds1_GSEA_final$symbol
```

```{r}
#BiocManager::install("fgsea")
library("fgsea")
```


```{r}
# Load the pathway (gene set) into a named list
# downloaded mysigdb
pathways.hallmark <- gmtPathways("GSEA/GSEA_R/h.all.v7.5.1.symbols.gmt")

```

```{r}
# show a few lines from the pathways file
head(pathways.hallmark)
```

```{r}
#Running fgsea algorithm:
fgseaRes1 <- fgseaMultilevel(pathways=pathways.hallmark, stats=ranks)
```

```{r}
# Tidy the results:
fgseaResTidy <- fgseaRes1 %>%
  as_tibble() %>%
  arrange(desc(NES)) # order by normalized enrichment score (NES)

```


```{r}
# To see what genes are in each of these pathways:
gene_in_pathway <- pathways.hallmark %>% 
  enframe("pathway", "symbol") %>% 
  unnest(cols = c(symbol)) %>% 
  inner_join(dds1_GSEA_final, by="symbol")
```


# VISUALIZATION

```{r}

#__________bar plot _______________#
# Plot the normalized enrichment scores. 
#Color the bar indicating whether or not the pathway was significant:
fgseaResTidy$adjPvalue <- ifelse(fgseaResTidy$padj <= 0.05, "significant", "non-significant")
cols <- c("non-significant" = "grey", "significant" = "red")
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES, fill = adjPvalue)) +
  geom_col() +
  scale_fill_manual(values = cols) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
  title="Hallmark pathways Enrichment Score from GSEA")
```


```{r}
#__________Enrichment  Plot_______#
# Enrichment plot for E2F target gene set
plotEnrichment(pathway = pathways.hallmark[["HALLMARK_E2F_TARGETS"]], ranks)
```


```{r}
plotGseaTable(pathways.hallmark[fgseaRes1$pathway[fgseaRes1$padj < 0.05]], ranks, fgseaRes1, 
              gseaParam=0.5)
```



#### Over representation analysis using clusterProfiler and MSigDB
```{r}
hallmark <- read.gmt("GSEA/GSEA_R/h.all.v7.5.1.symbols.gmt")
```


```{r}
head(hallmark)
```


```{r}
#install.packages("msigdbr")
library(msigdbr)
msigdbr_show_species()
```

#### Retrieveing Human gene set
```{r}
Human_df <- msigdbr(species = "Homo sapiens")
head(Human_df, 2) %>% as.data.frame
```

#### using Hallmark :
```{r}
hm_t2g <- msigdbr(species = "Homo sapiens", category = "H")%>%  dplyr::select(gs_name, ensembl_gene)
head(hm_t2g)
```


#### MSigDb over-presentaton analysis
```{r}
hm_ora <- enricher(sigOE_genes, TERM2GENE=hm_t2g)
head(hm_ora)
```

```{r}
dotplot(hm_ora, showCategory=25, title = "Hallmarks",label_format = 55)

```
```{r}
barplot(hm_ora, showCategory = 15)
```

#### ORA Hallmarks RGP vs VGP 
```{r}
## Create background dataset for hypergeometric testing using all genes tested for significance in the results                 
allOE2_genes <- as.character(dds2_res_anno$ensgene)

## Extract significant results
sigOE2 <- dplyr::filter(dds2_res_anno, padj < 0.05)

sigOE2_genes <- as.character(sigOE2$ensgene) #genes vector
```




```{r}
hm_ora_RGP_VGP <- enricher(sigOE2_genes, TERM2GENE=hm_t2g)
View(as.data.frame(hm_ora_RGP_VGP))
```

```{r}
barplot(hm_ora_RGP_VGP)
```


```{r}
head(sigOE2_genes)
```

#### ORA Hallmarks VGP vs MET 
```{r}
## Create background dataset for hypergeometric testing using all genes tested for significance in the results                 
allOE3_genes <- as.character(dds3_res_anno$ensgene)

## Extract significant results
sigOE3 <- dplyr::filter(dds3_res_anno, padj < 0.05)

sigOE3_genes <- as.character(sigOE3$ensgene) #genes vector
```

```{r}
hm_ora_VGP_MET <- enricher(sigOE3_genes, TERM2GENE=hm_t2g)
View(as.data.frame(hm_ora_VGP_MET))
```

```{r}
barplot(hm_ora_VGP_MET)
```





Multiple comparison overtime #cancelled.
how the genes behave overtime #cancelled. 
time dependancy #cancelled  .
time series #cancelled 
we have 7 gene sets GO term list based on genes 
vis of diff exp in each step

#writing #introduction #material and methods  
ref 
bioc
hashes with work illustration 








